mirror of
https://github.com/jambonz/jambonz-api-server.git
synced 2026-01-25 02:08:24 +00:00
support resemble TTS (#488)
* support resemble TTS
* wip
* wip
* update speech utils version
* update resemble voice list
This commit is contained in:
@@ -21,6 +21,7 @@ const TtsPlayHtLanguagesVoices = require('./speech-data/tts-playht');
|
||||
const TtsVerbioLanguagesVoices = require('./speech-data/tts-verbio');
|
||||
const TtsInworldLanguagesVoices = require('./speech-data/tts-inworld');
|
||||
const ttsCartesia = require('./speech-data/tts-cartesia');
|
||||
const TtsResembleLanguagesVoices = require('./speech-data/tts-resemble');
|
||||
|
||||
const TtsModelDeepgram = require('./speech-data/tts-model-deepgram');
|
||||
const TtsLanguagesDeepgram = require('./speech-data/tts-deepgram');
|
||||
@@ -424,6 +425,24 @@ const testWhisper = async(logger, synthAudio, credentials) => {
|
||||
}
|
||||
};
|
||||
|
||||
/**
 * Checks a set of Resemble TTS credentials by asking `synthAudio` to
 * synthesize one short fixed phrase with the `resemble` vendor.
 *
 * @param {object} logger - logger exposing `info(obj, msg)`; used only on failure
 * @param {Function} synthAudio - async function invoked as `synthAudio(stats, options)`
 * @param {object} credentials - passed through untouched as `options.credentials`
 * @returns {Promise<void>} resolves once `synthAudio` resolves; its result is discarded
 * @throws rethrows, after logging it, whatever `synthAudio` rejects with
 */
const testResembleTTS = async(logger, synthAudio, credentials) => {
  // First argument is a stub whose increment/histogram do nothing
  // (presumably standing in for a stats collector -- confirm against synthAudio).
  const noopStats = {increment: () => {}, histogram: () => {}};

  try {
    await synthAudio(noopStats,
      {
        vendor: 'resemble',
        credentials,
        language: 'en-US',
        voice: '3f5fb9f1',
        text: 'Hi there and welcome to jambones!',
        renderForCaching: true
      }
    );
  } catch (err) {
    logger.info({err}, 'synth resemble returned error');
    // Propagate the original error so the caller can report the failed test.
    throw err;
  }
};
|
||||
|
||||
const testDeepgramTTS = async(logger, synthAudio, credentials) => {
|
||||
try {
|
||||
await synthAudio({increment: () => {}, histogram: () => {}},
|
||||
@@ -729,6 +748,11 @@ function decryptCredential(obj, credential, logger, isObscureKey = true) {
|
||||
const o = JSON.parse(decrypt(credential));
|
||||
obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
|
||||
obj.service_version = o.service_version;
|
||||
} else if ('resemble' === obj.vendor) {
|
||||
const o = JSON.parse(decrypt(credential));
|
||||
obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
|
||||
obj.resemble_tts_uri = o.resemble_tts_uri;
|
||||
obj.resemble_tts_use_tls = o.resemble_tts_use_tls;
|
||||
} else if ('voxist' === obj.vendor) {
|
||||
const o = JSON.parse(decrypt(credential));
|
||||
obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
|
||||
@@ -799,6 +823,8 @@ async function getLanguagesAndVoicesForVendor(logger, vendor, credential, getTts
|
||||
return await getLanguagesVoicesForRimelabs(credential, getTtsVoices, logger);
|
||||
case 'inworld':
|
||||
return await getLanguagesVoicesForInworld(credential, getTtsVoices, logger);
|
||||
case 'resemble':
|
||||
return await getLanguagesAndVoicesForResemble(credential, getTtsVoices, logger);
|
||||
case 'assemblyai':
|
||||
return await getLanguagesVoicesForAssemblyAI(credential, getTtsVoices, logger);
|
||||
case 'voxist':
|
||||
@@ -1240,6 +1266,82 @@ async function getLanguagesVoicesForVerbio(credentials, getTtsVoices, logger) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Builds the language/voice list for the Resemble TTS vendor.
 *
 * When a credential is supplied, every page of
 * `https://app.resemble.ai/api/v2/voices` is fetched using the credential's
 * `api_key`. Voices are kept only when `status === 'finished'` and
 * `component_status.text_to_speech.status === 'ready'`, then grouped by
 * `default_language` (falling back to 'en-US'), and both the languages and
 * the voices within each language are sorted by display name.
 *
 * If no credential is supplied, or anything goes wrong while listing voices
 * (non-2xx response, `success` not truthy, malformed payload, network error),
 * the predefined `TtsResembleLanguagesVoices` list is returned instead; the
 * error is logged and never propagated.
 *
 * @param {object} [credential] - decrypted credential; only `api_key` is read
 * @param {Function} getTtsVoices - unused here; kept so the signature matches
 *   the other vendor helpers dispatched from getLanguagesAndVoicesForVendor
 * @param {object} logger - logger exposing `info(obj, msg)`
 * @returns {Promise<object>} the result of `tranform(ttsList)`
 */
async function getLanguagesAndVoicesForResemble(credential, getTtsVoices, logger) {
  if (credential) {
    try {
      const {api_key} = credential;
      const allVoices = [];
      let page = 1;
      let hasMorePages = true;

      // Fetch all pages of voices
      while (hasMorePages) {
        const response = await fetch(`https://app.resemble.ai/api/v2/voices?page=${page}&page_size=100`, {
          headers: {
            'Authorization': `Token token=${api_key}`,
            'Accept': 'application/json'
          }
        });

        if (!response.ok) {
          throw new Error('failed to list voices');
        }

        const data = await response.json();

        if (!data.success) {
          throw new Error('API returned unsuccessful response');
        }
        // Fail explicitly (and fall back below) rather than letting a malformed
        // payload slip into the list as a bogus element.
        if (!Array.isArray(data.items)) {
          throw new Error('API returned no voice list');
        }

        allVoices.push(...data.items);

        // Check if there are more pages; a missing num_pages ends the loop.
        hasMorePages = page < data.num_pages;
        page++;
      }

      // Filter only finished voices that support text_to_speech
      const availableVoices = allVoices.filter((voice) =>
        voice.status === 'finished' &&
        voice.component_status?.text_to_speech?.status === 'ready'
      );

      // Group voices by language; the Map keeps first-seen order and avoids
      // rescanning the accumulated languages for every voice.
      const byLanguage = new Map();
      for (const voice of availableVoices) {
        const languageCode = voice.default_language || 'en-US';
        let language = byLanguage.get(languageCode);
        if (!language) {
          language = {
            value: languageCode,
            name: capitalizeFirst(languageCode),
            voices: []
          };
          byLanguage.set(languageCode, language);
        }
        language.voices.push({
          name: `${voice.name} (${voice.voice_type}) - ${voice.source}`,
          value: voice.uuid
        });
      }

      // Sort languages and voices
      const ttsVoices = [...byLanguage.values()];
      ttsVoices.sort((a, b) => a.name.localeCompare(b.name));
      ttsVoices.forEach((lang) => {
        lang.voices.sort((a, b) => a.name.localeCompare(b.name));
      });
      return tranform(ttsVoices);
    } catch (err) {
      // Object-first form so the error is actually serialized into the log
      // record, matching the logger.info({err}, msg) usage in testResembleTTS.
      logger.info({err}, 'Error while fetching Resemble languages, voices, return predefined values');
    }
  }

  return tranform(TtsResembleLanguagesVoices);
}
|
||||
|
||||
function tranform(tts, stt, models, sttModels) {
|
||||
return {
|
||||
...(tts && {tts}),
|
||||
@@ -1528,5 +1630,6 @@ module.exports = {
|
||||
testSpeechmaticsStt,
|
||||
testCartesia,
|
||||
testVoxistStt,
|
||||
testOpenAiStt
|
||||
testOpenAiStt,
|
||||
testResembleTTS
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user