mirror of
https://github.com/jambonz/speech-utils.git
synced 2025-12-19 03:37:49 +00:00
Merge pull request #120 from jambonz/feat/resemble_tts
support resemble ai
This commit is contained in:
@@ -98,7 +98,7 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
|
|||||||
logger = logger || noopLogger;
|
logger = logger || noopLogger;
|
||||||
|
|
||||||
assert.ok(['google', 'aws', 'polly', 'microsoft', 'wellsaid', 'nuance', 'nvidia', 'ibm', 'elevenlabs',
|
assert.ok(['google', 'aws', 'polly', 'microsoft', 'wellsaid', 'nuance', 'nvidia', 'ibm', 'elevenlabs',
|
||||||
'whisper', 'deepgram', 'playht', 'rimelabs', 'verbio', 'cartesia', 'inworld'].includes(vendor) ||
|
'whisper', 'deepgram', 'playht', 'rimelabs', 'verbio', 'cartesia', 'inworld', 'resemble'].includes(vendor) ||
|
||||||
vendor.startsWith('custom'),
|
vendor.startsWith('custom'),
|
||||||
`synthAudio supported vendors are google, aws, microsoft, nuance, nvidia and wellsaid ..etc, not ${vendor}`);
|
`synthAudio supported vendors are google, aws, microsoft, nuance, nvidia and wellsaid ..etc, not ${vendor}`);
|
||||||
if ('google' === vendor) {
|
if ('google' === vendor) {
|
||||||
@@ -166,6 +166,9 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
|
|||||||
} else if ('cartesia' === vendor) {
|
} else if ('cartesia' === vendor) {
|
||||||
assert.ok(credentials.api_key, 'synthAudio requires api_key when cartesia is used');
|
assert.ok(credentials.api_key, 'synthAudio requires api_key when cartesia is used');
|
||||||
assert.ok(credentials.model_id, 'synthAudio requires model_id when cartesia is used');
|
assert.ok(credentials.model_id, 'synthAudio requires model_id when cartesia is used');
|
||||||
|
} else if (vendor === 'resemble') {
|
||||||
|
assert.ok(voice, 'synthAudio requires voice when resemble is used');
|
||||||
|
assert.ok(credentials.api_key, 'synthAudio requires api_key when resemble is used');
|
||||||
}
|
}
|
||||||
|
|
||||||
const key = makeSynthKey({
|
const key = makeSynthKey({
|
||||||
@@ -263,6 +266,10 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
|
|||||||
audioData = await synthDeepgram(logger, {credentials, stats, model, text,
|
audioData = await synthDeepgram(logger, {credentials, stats, model, text,
|
||||||
renderForCaching, disableTtsStreaming});
|
renderForCaching, disableTtsStreaming});
|
||||||
break;
|
break;
|
||||||
|
case 'resemble':
|
||||||
|
audioData = await synthResemble(logger, {
|
||||||
|
credentials, stats, voice, text, options, renderForCaching, disableTtsStreaming});
|
||||||
|
break;
|
||||||
case vendor.startsWith('custom') ? vendor : 'cant_match_value':
|
case vendor.startsWith('custom') ? vendor : 'cant_match_value':
|
||||||
audioData = await synthCustomVendor(logger,
|
audioData = await synthCustomVendor(logger,
|
||||||
{credentials, stats, language, voice, text});
|
{credentials, stats, language, voice, text});
|
||||||
@@ -1285,6 +1292,61 @@ const synthCartesia = async(logger, {
|
|||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
 * Synthesize speech using Resemble AI.
 *
 * When streaming is allowed (JAMBONES_DISABLE_TTS_STREAMING is falsy, and neither
 * renderForCaching nor disableTtsStreaming is set) no HTTP request is made here;
 * a `say:{...}` string carrying the vendor parameters followed by the text
 * (with CR/LF replaced by spaces) is returned instead.
 * Otherwise the text is POSTed to `/synthesize` on the configured base URL and the
 * base64 `audio_content` field of the JSON reply is decoded into a buffer.
 *
 * @param {object} logger - logger exposing info()
 * @param {object} params
 * @param {object} params.credentials - must carry api_key; resemble_tts_uri optionally
 *   overrides the default base URL
 * @param {object} [params.options] - optional project_uuid and use_hd settings
 * @param {object} params.stats - stats client exposing increment()
 * @param {string} params.voice - sent as voice_uuid in the request
 * @param {string} params.text - text to synthesize
 * @param {boolean} [params.renderForCaching] - force the HTTP (non-streaming) path
 * @param {boolean} [params.disableTtsStreaming] - force the HTTP (non-streaming) path
 * @returns {Promise<object>} either {filePath, servedFromCache, rtt} for the streaming
 *   path, or {audioContent, extension, sampleRate} for the HTTP path
 * @throws rethrows any request/parsing error after logging it and counting the failure
 */
const synthResemble = async(logger, {
  credentials, options, stats, voice, text, renderForCaching, disableTtsStreaming
}) => {
  const {api_key, resemble_tts_uri} = credentials;
  const {project_uuid, use_hd} = options || {};

  /* default to using the streaming interface, unless disabled by env var OR we want just a cache file */
  if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
    let params = '';
    params += `{api_key=${api_key}`;
    params += ',vendor=resemble';
    params += `,voice=${voice}`;
    params += ',write_cache_file=1';
    if (project_uuid) params += `,project_uuid=${project_uuid}`;
    if (use_hd) params += `,use_hd=${use_hd}`;
    if (resemble_tts_uri) params += `,endpoint=${resemble_tts_uri}`;

    params += '}';

    return {
      filePath: `say:${params}${text.replace(/\n/g, ' ').replace(/\r/g, ' ')}`,
      servedFromCache: false,
      rtt: 0
    };
  }

  try {
    const baseUrl = resemble_tts_uri || 'https://f.cluster.resemble.ai';
    const post = bent(baseUrl, 'POST', 'buffer', {
      'Authorization': `Bearer ${api_key}`,
      'Content-Type': 'application/json'
    });
    const response = await post('/synthesize', {
      voice_uuid: voice,
      data: text,
      sample_rate: 8000,
      output_format: 'mp3',
      ...(project_uuid && {project_uuid}),
      ...(use_hd && {use_hd}),
    });

    const json = JSON.parse(response.toString('utf8'));
    /* fail with a descriptive error rather than the TypeError Buffer.from would raise */
    if (!json || typeof json.audio_content !== 'string') {
      throw new Error('synth Resemble response did not contain audio_content');
    }
    const audioContent = Buffer.from(json.audio_content, 'base64');
    return {
      audioContent,
      extension: 'mp3',
      sampleRate: 8000
    };
  } catch (err) {
    /* attribute the failure to resemble (was mislabelled as elevenlabs) */
    logger.info({err}, 'synth Resemble returned error');
    stats.increment('tts.count', ['vendor:resemble', 'accepted:no']);
    throw err;
  }
};
|
||||||
|
|
||||||
const getFileExtFromMime = (mime) => {
|
const getFileExtFromMime = (mime) => {
|
||||||
switch (mime) {
|
switch (mime) {
|
||||||
case 'audio/wav':
|
case 'audio/wav':
|
||||||
|
|||||||
@@ -750,6 +750,35 @@ test('inworld speech synth', async(t) => {
|
|||||||
client.quit();
|
client.quit();
|
||||||
});
|
});
|
||||||
|
|
||||||
|
/* Integration test for the Resemble vendor; only exercised when RESEMBLE_API_KEY is set. */
test('resemble speech synth', async(t) => {
  const fn = require('..');
  const {synthAudio, client} = fn(opts, logger);

  /* without credentials there is nothing to call: record a pass and finish early */
  if (!process.env.RESEMBLE_API_KEY) {
    t.pass('skipping resemble speech synth tests since RESEMBLE_API_KEY is not provided');
    return t.end();
  }

  const text = '<speak prompt="Speak in an excited, upbeat tone">Hello from Resemble!</speak>';
  const request = {
    vendor: 'resemble',
    credentials: {
      api_key: process.env.RESEMBLE_API_KEY,
    },
    language: 'en',
    voice: '3f5fb9f1',
    text,
    /* renderForCaching asks for a rendered file rather than the streaming path */
    renderForCaching: true
  };

  try {
    const result = await synthAudio(stats, request);
    t.ok(!result.servedFromCache, `successfully synthesized resemble audio to ${result.filePath}`);
  } catch (err) {
    console.error(JSON.stringify(err));
    t.end(err);
  }
  client.quit();
});
|
||||||
|
|
||||||
test('rimelabs speech synth tests mist', async(t) => {
|
test('rimelabs speech synth tests mist', async(t) => {
|
||||||
const fn = require('..');
|
const fn = require('..');
|
||||||
const {synthAudio, client} = fn(opts, logger);
|
const {synthAudio, client} = fn(opts, logger);
|
||||||
|
|||||||
Reference in New Issue
Block a user