mirror of
https://github.com/jambonz/speech-utils.git
synced 2025-12-19 03:37:49 +00:00
remove audio extension from audio key
This commit is contained in:
@@ -3,6 +3,27 @@ const {noopLogger, makeSynthKey} = require('./utils');
|
||||
const {JAMBONES_TTS_CACHE_DURATION_MINS} = require('./config');
|
||||
const EXPIRES = JAMBONES_TTS_CACHE_DURATION_MINS;
|
||||
|
||||
/**
 * Sample rates (Hz) implied by the raw-audio file extensions this module knows.
 * A Map is used instead of an object literal so that extensions which happen to
 * match inherited Object.prototype members (e.g. "constructor") are not looked up.
 */
const SAMPLE_RATE_BY_EXTENSION = new Map([
  ['r8', 8000],
  ['r16', 16000],
  ['r24', 24000],
  ['r32', 32000],
  ['r44', 44100],
  ['r48', 48000],
  ['r96', 96000],
]);

/**
 * Derive the audio file extension and its sample rate from a file path.
 *
 * @param {string} path - path of the audio file
 * @returns {[string, number]} tuple of [extension (without the dot), sampleRate in Hz].
 *   Falls back to ['wav', 8000] when the final path segment has no extension, and
 *   to 8000 Hz when the extension is not one of the known raw-audio extensions.
 */
function getExtensionAndSampleRate(path) {
  // The extension may not contain a dot or a path separator; otherwise a dotted
  // directory name followed by an extension-less file ("/tmp/a.b/file") would
  // be reported as extension "b/file". Requiring at least one character also
  // keeps a trailing dot ("file.") from producing an empty extension.
  const match = path.match(/\.([^./\\]+)$/);
  if (!match) {
    // default should be wav file.
    return ['wav', 8000];
  }

  const extension = match[1];
  const sampleRate = SAMPLE_RATE_BY_EXTENSION.get(extension) ?? 8000;
  return [extension, sampleRate];
}
|
||||
|
||||
async function addFileToCache(client, logger, path,
|
||||
{account_sid, vendor, language, voice, deploymentId, engine, text}) {
|
||||
let key;
|
||||
@@ -17,8 +38,15 @@ async function addFileToCache(client, logger, path,
|
||||
engine,
|
||||
text,
|
||||
});
|
||||
const [extension, sampleRate] = getExtensionAndSampleRate(path);
|
||||
const audioBuffer = await fs.readFile(path);
|
||||
await client.setex(key, EXPIRES, audioBuffer.toString('base64'));
|
||||
await client.setex(key, EXPIRES, JSON.stringify(
|
||||
{
|
||||
audioContent: audioBuffer.toString('base64'),
|
||||
extension,
|
||||
sampleRate
|
||||
}
|
||||
));
|
||||
} catch (err) {
|
||||
logger.error(err, 'addFileToCache: Error');
|
||||
return;
|
||||
|
||||
@@ -46,7 +46,7 @@ const {
|
||||
JAMBONES_HTTP_PROXY_IP,
|
||||
JAMBONES_HTTP_PROXY_PORT,
|
||||
JAMBONES_TTS_CACHE_DURATION_MINS,
|
||||
JAMBONES_EAGERLY_PRE_CACHE_AUDIO,
|
||||
JAMBONES_TTS_TRIM_SILENCE,
|
||||
} = require('./config');
|
||||
const EXPIRES = JAMBONES_TTS_CACHE_DURATION_MINS;
|
||||
const OpenAI = require('openai');
|
||||
@@ -91,7 +91,7 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
|
||||
vendor, language, voice, gender, text, engine, salt, model, credentials, deploymentId,
|
||||
disableTtsCache, renderForCaching = false, disableTtsStreaming, options
|
||||
}) {
|
||||
let audioBuffer;
|
||||
let audioData;
|
||||
let servedFromCache = false;
|
||||
let rtt;
|
||||
logger = logger || noopLogger;
|
||||
@@ -172,54 +172,22 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
|
||||
text,
|
||||
renderForCaching
|
||||
});
|
||||
let filePath;
|
||||
// used only for custom vendor
|
||||
let fileExtension;
|
||||
filePath = makeFilePath({vendor, voice, key, salt, renderForCaching});
|
||||
|
||||
debug(`synth key is ${key}`);
|
||||
let cached;
|
||||
if (!disableTtsCache) {
|
||||
cached = await client.get(key);
|
||||
/**
|
||||
* If we are using tts streaming and also precaching audio, audio could have been cached by streaming (r8)
|
||||
* or here in speech-utils due to precaching (mp3), so we need to check for both keys.
|
||||
*/
|
||||
if (!cached && JAMBONES_EAGERLY_PRE_CACHE_AUDIO) {
|
||||
const preCachekey = makeSynthKey({
|
||||
account_sid,
|
||||
vendor,
|
||||
language: language || '',
|
||||
voice: voice || deploymentId,
|
||||
engine,
|
||||
text,
|
||||
renderForCaching: true
|
||||
});
|
||||
cached = await client.get(preCachekey);
|
||||
if (cached) {
|
||||
// Precache audio is available; update filePath with the precache file extension.
|
||||
filePath = makeFilePath({vendor, voice, key, salt, renderForCaching: true});
|
||||
}
|
||||
}
|
||||
}
|
||||
if (cached) {
|
||||
// found in cache - extend the expiry and use it
|
||||
debug('result WAS found in cache');
|
||||
servedFromCache = true;
|
||||
stats.increment('tts.cache.requests', ['found:yes']);
|
||||
if (vendor.startsWith('custom')) {
|
||||
// custom vendors support multiple mime types such as: mp3, wav, r8, r16 ...etc,
|
||||
// mime type/file extension is available when http response has header Content-type.
|
||||
// In cache, file extension is stored together with audioBuffer in a json.
|
||||
// Normal cache audio will be base64 string
|
||||
const payload = JSON.parse(cached);
|
||||
filePath = filePath.replace(/\.[^\.]*$/g, payload.fileExtension);
|
||||
audioBuffer = Buffer.from(payload.audioBuffer, 'base64');
|
||||
} else {
|
||||
audioBuffer = Buffer.from(cached, 'base64');
|
||||
}
|
||||
audioData = JSON.parse(cached);
|
||||
// convert base64 audio to buffer
|
||||
audioData.audioContent = Buffer.from(audioData.audioContent, 'base64');
|
||||
client.expire(key, EXPIRES).catch((err) => logger.info(err, 'Error setting expires'));
|
||||
}
|
||||
if (!cached) {
|
||||
} else {
|
||||
// not found in cache - go get it from speech vendor and add to cache
|
||||
debug('result was NOT found in cache');
|
||||
stats.increment('tts.cache.requests', ['found:no']);
|
||||
@@ -227,94 +195,92 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
|
||||
const startAt = process.hrtime();
|
||||
switch (vendor) {
|
||||
case 'google':
|
||||
audioBuffer = await synthGoogle(logger, {credentials, stats, language, voice, gender, text});
|
||||
audioData = await synthGoogle(logger, {credentials, stats, language, voice, gender, text});
|
||||
break;
|
||||
case 'aws':
|
||||
case 'polly':
|
||||
vendorLabel = 'aws';
|
||||
audioBuffer = await synthPolly(createHash, retrieveHash, logger,
|
||||
audioData = await synthPolly(createHash, retrieveHash, logger,
|
||||
{credentials, stats, language, voice, text, engine});
|
||||
break;
|
||||
case 'azure':
|
||||
case 'microsoft':
|
||||
vendorLabel = 'microsoft';
|
||||
audioBuffer = await synthMicrosoft(logger, {credentials, stats, language, voice, text, deploymentId,
|
||||
filePath, renderForCaching, disableTtsStreaming});
|
||||
audioData = await synthMicrosoft(logger, {credentials, stats, language, voice, text, deploymentId,
|
||||
renderForCaching, disableTtsStreaming});
|
||||
break;
|
||||
case 'nuance':
|
||||
model = model || 'enhanced';
|
||||
audioBuffer = await synthNuance(client, logger, {credentials, stats, voice, model, text});
|
||||
audioData = await synthNuance(client, logger, {credentials, stats, voice, model, text});
|
||||
break;
|
||||
case 'nvidia':
|
||||
audioBuffer = await synthNvidia(client, logger, {credentials, stats, language, voice, model, text});
|
||||
audioData = await synthNvidia(client, logger, {credentials, stats, language, voice, model, text});
|
||||
break;
|
||||
case 'ibm':
|
||||
audioBuffer = await synthIbm(logger, {credentials, stats, voice, text});
|
||||
audioData = await synthIbm(logger, {credentials, stats, voice, text});
|
||||
break;
|
||||
case 'wellsaid':
|
||||
audioBuffer = await synthWellSaid(logger, {credentials, stats, language, voice, text, filePath});
|
||||
audioData = await synthWellSaid(logger, {credentials, stats, language, voice, text});
|
||||
break;
|
||||
case 'elevenlabs':
|
||||
audioBuffer = await synthElevenlabs(logger, {
|
||||
credentials, options, stats, language, voice, text, renderForCaching, disableTtsStreaming, filePath
|
||||
});
|
||||
audioData = await synthElevenlabs(logger, {
|
||||
credentials, options, stats, language, voice, text, renderForCaching, disableTtsStreaming});
|
||||
break;
|
||||
case 'playht':
|
||||
audioBuffer = await synthPlayHT(client, logger, {
|
||||
credentials, options, stats, language, voice, text, renderForCaching, disableTtsStreaming, filePath
|
||||
});
|
||||
audioData = await synthPlayHT(client, logger, {
|
||||
credentials, options, stats, language, voice, text, renderForCaching, disableTtsStreaming});
|
||||
break;
|
||||
case 'cartesia':
|
||||
audioBuffer = await synthCartesia(logger, {
|
||||
credentials, options, stats, language, voice, text, renderForCaching, disableTtsStreaming, filePath
|
||||
});
|
||||
audioData = await synthCartesia(logger, {
|
||||
credentials, options, stats, language, voice, text, renderForCaching, disableTtsStreaming});
|
||||
break;
|
||||
case 'rimelabs':
|
||||
audioBuffer = await synthRimelabs(logger, {
|
||||
credentials, options, stats, language, voice, text, renderForCaching, disableTtsStreaming, filePath
|
||||
});
|
||||
audioData = await synthRimelabs(logger, {
|
||||
credentials, options, stats, language, voice, text, renderForCaching, disableTtsStreaming});
|
||||
break;
|
||||
case 'whisper':
|
||||
audioBuffer = await synthWhisper(logger, {
|
||||
audioData = await synthWhisper(logger, {
|
||||
credentials, stats, voice, text, renderForCaching, disableTtsStreaming});
|
||||
break;
|
||||
case 'verbio':
|
||||
audioBuffer = await synthVerbio(client, logger, {
|
||||
audioData = await synthVerbio(client, logger, {
|
||||
credentials, stats, voice, text, renderForCaching, disableTtsStreaming});
|
||||
if (audioBuffer?.filePath) return audioBuffer;
|
||||
if (audioData?.filePath) return audioData;
|
||||
break;
|
||||
case 'deepgram':
|
||||
audioBuffer = await synthDeepgram(logger, {credentials, stats, model, text,
|
||||
audioData = await synthDeepgram(logger, {credentials, stats, model, text,
|
||||
renderForCaching, disableTtsStreaming});
|
||||
break;
|
||||
case vendor.startsWith('custom') ? vendor : 'cant_match_value':
|
||||
({ audioBuffer, filePath, fileExtension } = await synthCustomVendor(logger,
|
||||
{credentials, stats, language, voice, text, filePath}));
|
||||
audioData = await synthCustomVendor(logger,
|
||||
{credentials, stats, language, voice, text});
|
||||
break;
|
||||
default:
|
||||
assert(`synthAudio: unsupported speech vendor ${vendor}`);
|
||||
}
|
||||
if ('filePath' in audioBuffer) return audioBuffer;
|
||||
if ('filePath' in audioData) return audioData;
|
||||
const diff = process.hrtime(startAt);
|
||||
const time = diff[0] * 1e3 + diff[1] * 1e-6;
|
||||
rtt = time.toFixed(0);
|
||||
stats.histogram('tts.response_time', rtt, [`vendor:${vendorLabel}`]);
|
||||
debug(`tts rtt time for ${text.length} chars on ${vendorLabel}: ${rtt}`);
|
||||
logger.info(`tts rtt time for ${text.length} chars on ${vendorLabel}: ${rtt}`);
|
||||
|
||||
const base64Audio = audioBuffer.toString('base64');
|
||||
const cacheContent = vendor.startsWith('custom') ?
|
||||
JSON.stringify({
|
||||
audioBuffer: base64Audio,
|
||||
fileExtension
|
||||
}) : base64Audio;
|
||||
|
||||
client.setex(key, EXPIRES, cacheContent)
|
||||
// Save audio json to cache
|
||||
client.setex(key, EXPIRES, JSON.stringify({
|
||||
...audioData,
|
||||
audioContent: audioData.audioContent?.toString('base64')
|
||||
}))
|
||||
.catch((err) => logger.error(err, `error calling setex on key ${key}`));
|
||||
}
|
||||
|
||||
return new Promise((resolve, reject) => {
|
||||
fs.writeFile(filePath, audioBuffer, (err) => {
|
||||
const { audioContent, extension } = audioData;
|
||||
const filePath = makeFilePath({
|
||||
key,
|
||||
salt,
|
||||
extension
|
||||
});
|
||||
fs.writeFile(filePath, audioContent, (err) => {
|
||||
if (err) return reject(err);
|
||||
resolve({filePath, servedFromCache, rtt});
|
||||
});
|
||||
@@ -369,7 +335,13 @@ const synthPolly = async(createHash, retrieveHash, logger,
|
||||
.on('data', (chunk) => {
|
||||
chunks.push(chunk);
|
||||
})
|
||||
.on('end', () => resolve(Buffer.concat(chunks)));
|
||||
.on('end', () => resolve(
|
||||
{
|
||||
audioContent: Buffer.concat(chunks),
|
||||
extension: 'mp3',
|
||||
sampleRate: 8000
|
||||
}
|
||||
));
|
||||
});
|
||||
} catch (err) {
|
||||
logger.info({err}, 'synthAudio: Error synthesizing speech using aws polly');
|
||||
@@ -411,7 +383,11 @@ const synthGoogle = async(logger, {credentials, stats, language, voice, gender,
|
||||
};
|
||||
|
||||
const wav = await post('/v1beta1/text:synthesize', payload);
|
||||
return Buffer.from(wav.audioContent, 'base64');
|
||||
return {
|
||||
audioContent: Buffer.from(wav.audioContent, 'base64'),
|
||||
extension: 'wav',
|
||||
sampleRate: 24000
|
||||
};
|
||||
} catch (err) {
|
||||
logger.info({err: await err.text()}, 'synthGoogle returned error');
|
||||
throw err;
|
||||
@@ -432,7 +408,11 @@ const synthGoogle = async(logger, {credentials, stats, language, voice, gender,
|
||||
const responses = await client.synthesizeSpeech(opts);
|
||||
stats.increment('tts.count', ['vendor:google', 'accepted:yes']);
|
||||
client.close();
|
||||
return responses[0].audioContent;
|
||||
return {
|
||||
audioContent: responses[0].audioContent,
|
||||
extension: 'mp3',
|
||||
sampleRate: 8000
|
||||
};
|
||||
} catch (err) {
|
||||
console.error(err);
|
||||
logger.info({err, opts}, 'synthAudio: Error synthesizing speech using google');
|
||||
@@ -463,7 +443,11 @@ const synthIbm = async(logger, {credentials, stats, voice, text}) => {
|
||||
for await (const chunk of r.result) {
|
||||
chunks.push(chunk);
|
||||
}
|
||||
return Buffer.concat(chunks);
|
||||
return {
|
||||
audioContent: Buffer.concat(chunks),
|
||||
extension: 'mp3',
|
||||
sampleRate: 8000
|
||||
};
|
||||
} catch (err) {
|
||||
logger.info({err, params}, 'synthAudio: Error synthesizing speech using ibm');
|
||||
stats.increment('tts.count', ['vendor:ibm', 'accepted:no']);
|
||||
@@ -473,11 +457,9 @@ const synthIbm = async(logger, {credentials, stats, voice, text}) => {
|
||||
|
||||
async function _synthOnPremMicrosoft(logger, {
|
||||
credentials,
|
||||
stats,
|
||||
language,
|
||||
voice,
|
||||
text,
|
||||
filePath
|
||||
text
|
||||
}) {
|
||||
const {use_custom_tts, custom_tts_endpoint_url, api_key} = credentials;
|
||||
let content = text;
|
||||
@@ -499,15 +481,19 @@ async function _synthOnPremMicrosoft(logger, {
|
||||
}
|
||||
|
||||
try {
|
||||
const trimSilence = filePath.endsWith('.r8');
|
||||
const trimSilence = JAMBONES_TTS_TRIM_SILENCE;
|
||||
const post = bent('POST', 'buffer', {
|
||||
'X-Microsoft-OutputFormat': trimSilence ? 'raw-8khz-16bit-mono-pcm' : 'audio-16khz-32kbitrate-mono-mp3',
|
||||
'Content-Type': 'application/ssml+xml',
|
||||
'User-Agent': 'Jambonz',
|
||||
...(api_key && {'Ocp-Apim-Subscription-Key': api_key})
|
||||
});
|
||||
const mp3 = await post(custom_tts_endpoint_url, content);
|
||||
return mp3;
|
||||
const audioContent = await post(custom_tts_endpoint_url, content);
|
||||
return {
|
||||
audioContent,
|
||||
extension: trimSilence ? 'r8' : 'mp3',
|
||||
sampleRate: 8000
|
||||
};
|
||||
} catch (err) {
|
||||
logger.info({err}, '_synthMicrosoftByHttp returned error');
|
||||
throw err;
|
||||
@@ -520,7 +506,6 @@ const synthMicrosoft = async(logger, {
|
||||
language,
|
||||
voice,
|
||||
text,
|
||||
filePath,
|
||||
renderForCaching,
|
||||
disableTtsStreaming
|
||||
}) => {
|
||||
@@ -563,17 +548,18 @@ const synthMicrosoft = async(logger, {
|
||||
rtt: 0
|
||||
};
|
||||
}
|
||||
// Azure Onprem
|
||||
if (use_custom_tts && custom_tts_endpoint_url) {
|
||||
return await _synthOnPremMicrosoft(logger, {
|
||||
credentials,
|
||||
stats,
|
||||
language,
|
||||
voice,
|
||||
text,
|
||||
filePath
|
||||
text
|
||||
});
|
||||
}
|
||||
const trimSilence = filePath.endsWith('.r8');
|
||||
// Azure hosted service
|
||||
const trimSilence = JAMBONES_TTS_TRIM_SILENCE;
|
||||
const speechConfig = SpeechConfig.fromSubscription(apiKey, region);
|
||||
speechConfig.speechSynthesisLanguage = language;
|
||||
speechConfig.speechSynthesisVoiceName = voice;
|
||||
@@ -608,7 +594,11 @@ const synthMicrosoft = async(logger, {
|
||||
case ResultReason.SynthesizingAudioCompleted:
|
||||
let buffer = Buffer.from(result.audioData);
|
||||
if (trimSilence) buffer = trimTrailingSilence(buffer);
|
||||
resolve(buffer);
|
||||
resolve({
|
||||
audioContent: buffer,
|
||||
extension: trimSilence ? 'r8' : 'mp3',
|
||||
sampleRate: 8000
|
||||
});
|
||||
synthesizer.close();
|
||||
stats.increment('tts.count', ['vendor:microsoft', 'accepted:yes']);
|
||||
break;
|
||||
@@ -638,11 +628,15 @@ const synthWellSaid = async(logger, {credentials, stats, language, voice, gender
|
||||
'Accept': 'audio/mpeg',
|
||||
'Content-Type': 'application/json'
|
||||
});
|
||||
const mp3 = await post('/v1/tts/stream', {
|
||||
const audioContent = await post('/v1/tts/stream', {
|
||||
text,
|
||||
speaker_id: voice
|
||||
});
|
||||
return mp3;
|
||||
return {
|
||||
audioContent,
|
||||
extension: 'mp3',
|
||||
sampleRate: 8000
|
||||
};
|
||||
} catch (err) {
|
||||
logger.info({err}, 'testWellSaidTts returned error');
|
||||
throw err;
|
||||
@@ -679,8 +673,8 @@ const synthNuance = async(client, logger, {credentials, stats, voice, model, tex
|
||||
t.setText(text);
|
||||
input.setText(t);
|
||||
}
|
||||
|
||||
pcm.setSampleRateHz(8000);
|
||||
const sampleRate = 8000;
|
||||
pcm.setSampleRateHz(sampleRate);
|
||||
f.setPcm(pcm);
|
||||
p.setAudioFormat(f);
|
||||
v.setName(voice);
|
||||
@@ -704,7 +698,11 @@ const synthNuance = async(client, logger, {credentials, stats, voice, model, tex
|
||||
const details = status.getDetails();
|
||||
return reject({code, message, details});
|
||||
}
|
||||
resolve(Buffer.from(response.getAudio()));
|
||||
resolve({
|
||||
audioContent: Buffer.from(response.getAudio()),
|
||||
extension: 'r8',
|
||||
sampleRate
|
||||
});
|
||||
});
|
||||
});
|
||||
};
|
||||
@@ -712,12 +710,13 @@ const synthNuance = async(client, logger, {credentials, stats, voice, model, tex
|
||||
const synthNvidia = async(client, logger, {credentials, stats, language, voice, model, text}) => {
|
||||
const {riva_server_uri} = credentials;
|
||||
let rivaClient, request;
|
||||
const sampleRate = 8000;
|
||||
try {
|
||||
rivaClient = await createRivaClient(riva_server_uri);
|
||||
request = new SynthesizeSpeechRequest();
|
||||
request.setVoiceName(voice);
|
||||
request.setLanguageCode(language);
|
||||
request.setSampleRateHz(8000);
|
||||
request.setSampleRateHz(sampleRate);
|
||||
request.setEncoding(AudioEncoding.LINEAR_PCM);
|
||||
request.setText(text);
|
||||
} catch (err) {
|
||||
@@ -731,7 +730,11 @@ const synthNvidia = async(client, logger, {credentials, stats, language, voice,
|
||||
logger.info({err, voice, language}, 'error synthesizing speech using Nvidia');
|
||||
return reject(err);
|
||||
}
|
||||
resolve(Buffer.from(response.getAudio()));
|
||||
resolve({
|
||||
audioContent: Buffer.from(response.getAudio()),
|
||||
extension: 'r8',
|
||||
sampleRate
|
||||
});
|
||||
});
|
||||
});
|
||||
};
|
||||
@@ -753,14 +756,13 @@ const synthCustomVendor = async(logger, {credentials, stats, language, voice, te
|
||||
text
|
||||
});
|
||||
|
||||
const regex = /\.[^\.]*$/g;
|
||||
const mime = response.headers['content-type'];
|
||||
const buffer = await response.arrayBuffer();
|
||||
const fileExtension = getFileExtFromMime(mime);
|
||||
const [extension, sampleRate] = getFileExtFromMime(mime);
|
||||
return {
|
||||
audioBuffer: buffer,
|
||||
filePath: filePath.replace(regex, fileExtension),
|
||||
fileExtension
|
||||
audioContent: buffer,
|
||||
extension,
|
||||
sampleRate
|
||||
};
|
||||
} catch (err) {
|
||||
logger.info({err}, `Vendor ${vendor} returned error`);
|
||||
@@ -806,7 +808,7 @@ const synthElevenlabs = async(logger, {
|
||||
'Accept': 'audio/mpeg',
|
||||
'Content-Type': 'application/json'
|
||||
});
|
||||
const mp3 = await post(`/v1/text-to-speech/${voice}${optimize_streaming_latency}`, {
|
||||
const audioContent = await post(`/v1/text-to-speech/${voice}${optimize_streaming_latency}`, {
|
||||
text,
|
||||
model_id,
|
||||
voice_settings: {
|
||||
@@ -815,7 +817,11 @@ const synthElevenlabs = async(logger, {
|
||||
},
|
||||
...opts
|
||||
});
|
||||
return mp3;
|
||||
return {
|
||||
audioContent,
|
||||
extension: 'mp3',
|
||||
sampleRate: 8000
|
||||
};
|
||||
} catch (err) {
|
||||
logger.info({err}, 'synth Elevenlabs returned error');
|
||||
stats.increment('tts.count', ['vendor:elevenlabs', 'accepted:no']);
|
||||
@@ -897,7 +903,7 @@ const synthPlayHT = async(client, logger, {
|
||||
'Content-Type': 'application/json'
|
||||
});
|
||||
|
||||
const mp3 = await post(synthesizeUrl, {
|
||||
const audioContent = await post(synthesizeUrl, {
|
||||
text,
|
||||
...(voice_engine === 'Play3.0' && { language }),
|
||||
voice,
|
||||
@@ -906,7 +912,11 @@ const synthPlayHT = async(client, logger, {
|
||||
sample_rate: 8000,
|
||||
...opts
|
||||
});
|
||||
return mp3;
|
||||
return {
|
||||
audioContent,
|
||||
extension: 'mp3',
|
||||
sampleRate: 8000
|
||||
};
|
||||
} catch (err) {
|
||||
logger.info({err}, 'synth PlayHT returned error');
|
||||
stats.increment('tts.count', ['vendor:playht', 'accepted:no']);
|
||||
@@ -945,14 +955,19 @@ const synthRimelabs = async(logger, {
|
||||
'Accept': 'audio/mp3',
|
||||
'Content-Type': 'application/json'
|
||||
});
|
||||
const mp3 = await post('/v1/rime-tts', {
|
||||
const sampleRate = 8000;
|
||||
const audioContent = await post('/v1/rime-tts', {
|
||||
speaker: voice,
|
||||
text,
|
||||
modelId: model_id,
|
||||
samplingRate: 8000,
|
||||
samplingRate: sampleRate,
|
||||
...opts
|
||||
});
|
||||
return mp3;
|
||||
return {
|
||||
audioContent,
|
||||
extension: 'mp3',
|
||||
sampleRate
|
||||
};
|
||||
} catch (err) {
|
||||
logger.info({err}, 'synth rimelabs returned error');
|
||||
stats.increment('tts.count', ['vendor:rimelabs', 'accepted:no']);
|
||||
@@ -986,13 +1001,17 @@ const synthVerbio = async(client, logger, {credentials, stats, voice, text, rend
|
||||
'User-Agent': 'jambonz',
|
||||
'Content-Type': 'application/json'
|
||||
});
|
||||
const r8 = await post('/api/v1/synthesize', {
|
||||
const audioContent = await post('/api/v1/synthesize', {
|
||||
voice_id: voice,
|
||||
output_sample_rate: '8k',
|
||||
output_encoding: 'pcm16',
|
||||
text
|
||||
});
|
||||
return r8;
|
||||
return {
|
||||
audioContent,
|
||||
extension: 'r8',
|
||||
sampleRate: 8000
|
||||
};
|
||||
} catch (err) {
|
||||
logger.info({err}, 'synth Verbio returned error');
|
||||
stats.increment('tts.count', ['vendor:verbio', 'accepted:no']);
|
||||
@@ -1032,7 +1051,11 @@ const synthWhisper = async(logger, {credentials, stats, voice, text, renderForCa
|
||||
input: text,
|
||||
response_format: 'mp3'
|
||||
});
|
||||
return Buffer.from(await mp3.arrayBuffer());
|
||||
return {
|
||||
audioContent: Buffer.from(await mp3.arrayBuffer()),
|
||||
extension: 'mp3',
|
||||
sampleRate: 8000
|
||||
};
|
||||
} catch (err) {
|
||||
logger.info({err}, 'synth whisper returned error');
|
||||
stats.increment('tts.count', ['vendor:openai', 'accepted:no']);
|
||||
@@ -1064,10 +1087,14 @@ const synthDeepgram = async(logger, {credentials, stats, model, text, renderForC
|
||||
'Accept': 'audio/mpeg',
|
||||
'Content-Type': 'application/json'
|
||||
});
|
||||
const mp3 = await post(`/v1/speak?model=${model}`, {
|
||||
const audioContent = await post(`/v1/speak?model=${model}`, {
|
||||
text
|
||||
});
|
||||
return mp3;
|
||||
return {
|
||||
audioContent,
|
||||
extension: 'mp3',
|
||||
sampleRate: 8000
|
||||
};
|
||||
} catch (err) {
|
||||
logger.info({err}, 'synth Deepgram returned error');
|
||||
stats.increment('tts.count', ['vendor:deepgram', 'accepted:no']);
|
||||
@@ -1104,6 +1131,7 @@ const synthCartesia = async(logger, {
|
||||
|
||||
try {
|
||||
const client = new CartesiaClient({ apiKey: api_key });
|
||||
const sampleRate = 48000;
|
||||
const mp3 = await client.tts.bytes({
|
||||
modelId: model_id,
|
||||
transcript: text,
|
||||
@@ -1119,7 +1147,7 @@ const synthCartesia = async(logger, {
|
||||
),
|
||||
...(opts.speed || opts.emotion && {
|
||||
experimentalControls: {
|
||||
...(opts.speed && {speed: opts.speed}),
|
||||
...(opts.speed !== null && opts.speed !== undefined && {speed: opts.speed}),
|
||||
...(opts.emotion && {emotion: opts.emotion}),
|
||||
}
|
||||
})
|
||||
@@ -1128,10 +1156,14 @@ const synthCartesia = async(logger, {
|
||||
outputFormat: {
|
||||
container: 'mp3',
|
||||
bitRate: 128000,
|
||||
sampleRate: 8000
|
||||
sampleRate
|
||||
},
|
||||
});
|
||||
return Buffer.from(mp3);
|
||||
return {
|
||||
audioContent: Buffer.from(mp3),
|
||||
extension: 'mp3',
|
||||
sampleRate
|
||||
};
|
||||
} catch (err) {
|
||||
logger.info({err}, 'synth Cartesia returned error');
|
||||
stats.increment('tts.count', ['vendor:cartesia', 'accepted:no']);
|
||||
@@ -1144,22 +1176,22 @@ const getFileExtFromMime = (mime) => {
|
||||
switch (mime) {
|
||||
case 'audio/wav':
|
||||
case 'audio/x-wav':
|
||||
return '.wav';
|
||||
return ['wav', 8000];
|
||||
case /audio\/l16.*rate=8000/.test(mime) ? mime : 'cant match value':
|
||||
return '.r8';
|
||||
return ['r8', 8000];
|
||||
case /audio\/l16.*rate=16000/.test(mime) ? mime : 'cant match value':
|
||||
return '.r16';
|
||||
return ['r16', 16000];
|
||||
case /audio\/l16.*rate=24000/.test(mime) ? mime : 'cant match value':
|
||||
return '.r24';
|
||||
return ['r24', 24000];
|
||||
case /audio\/l16.*rate=32000/.test(mime) ? mime : 'cant match value':
|
||||
return '.r32';
|
||||
return ['r32', 32000];
|
||||
case /audio\/l16.*rate=48000/.test(mime) ? mime : 'cant match value':
|
||||
return '.r48';
|
||||
return ['r48', 48000];
|
||||
case 'audio/mpeg':
|
||||
case 'audio/mp3':
|
||||
return '.mp3';
|
||||
return ['mp3', 8000];
|
||||
default:
|
||||
return '.wav';
|
||||
return ['wav', 8000];
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
57
lib/utils.js
57
lib/utils.js
@@ -6,7 +6,7 @@ const pool = new Pool('https://auth.crt.nuance.com');
|
||||
const NUANCE_AUTH_ENDPOINT = 'tts.api.nuance.com:443';
|
||||
const grpc = require('@grpc/grpc-js');
|
||||
const formurlencoded = require('form-urlencoded');
|
||||
const { JAMBONES_DISABLE_TTS_STREAMING, JAMBONES_TTS_TRIM_SILENCE, TMP_FOLDER, HTTP_TIMEOUT } = require('./config');
|
||||
const { TMP_FOLDER, HTTP_TIMEOUT } = require('./config');
|
||||
|
||||
const debug = require('debug')('jambonz:realtimedb-helpers');
|
||||
/**
|
||||
@@ -17,68 +17,19 @@ const debug = require('debug')('jambonz:realtimedb-helpers');
|
||||
//const nuanceClientMap = new Map();
|
||||
|
||||
function makeSynthKey({
|
||||
account_sid = '', vendor, language, voice, engine = '', text,
|
||||
renderForCaching = false}) {
|
||||
account_sid = '', vendor, language, voice, engine = '', text}) {
|
||||
const hash = crypto.createHash('sha1');
|
||||
hash.update(`${language}:${vendor}:${voice}:${engine}:${text}`);
|
||||
const hexHashKey = hash.digest('hex');
|
||||
const accountKey = account_sid ? `:${account_sid}` : '';
|
||||
const namespace = vendor.startsWith('custom') ? vendor : getFileExtension({vendor, voice, renderForCaching});
|
||||
const key = `tts${accountKey}:${namespace}:${hexHashKey}`;
|
||||
const key = `tts${accountKey}:${hexHashKey}`;
|
||||
return key;
|
||||
}
|
||||
|
||||
function makeFilePath({vendor, voice, key, salt = '', renderForCaching = false}) {
|
||||
const extension = getFileExtension({vendor, renderForCaching, voice});
|
||||
function makeFilePath({key, salt = '', extension}) {
|
||||
return `${TMP_FOLDER}/${key.replace('tts:', `tts-${salt}`)}.${extension}`;
|
||||
}
|
||||
|
||||
/**
 * Pick the audio file extension ('mp3', 'r8' or 'wav') for a TTS vendor.
 *
 * @param {object} args
 * @param {string} args.vendor - speech vendor name
 * @param {string|object} [args.voice] - voice; only inspected for google, where an
 *   object carrying `voice_cloning_key` selects 'wav'
 * @param {boolean} [args.renderForCaching=false] - when true, vendors that
 *   otherwise resolve to 'r8' via the streaming check resolve to 'mp3'
 * @returns {string} the file extension, without a leading dot
 */
function getFileExtension({vendor, voice, renderForCaching = false}) {
  // Shared rule: 'mp3' when rendering for the cache or when
  // JAMBONES_DISABLE_TTS_STREAMING is set, otherwise 'r8'.
  const streamingAwareExtension = () => {
    return renderForCaching || JAMBONES_DISABLE_TTS_STREAMING ? 'mp3' : 'r8';
  };

  if (['azure', 'microsoft'].includes(vendor)) {
    // JAMBONES_TTS_TRIM_SILENCE forces 'r8' regardless of the streaming rule.
    return JAMBONES_TTS_TRIM_SILENCE ? 'r8' : streamingAwareExtension();
  }

  if (['deepgram', 'elevenlabs', 'rimelabs', 'playht', 'cartesia'].includes(vendor)) {
    return streamingAwareExtension();
  }

  if (['nuance', 'nvidia', 'verbio'].includes(vendor)) {
    return 'r8';
  }

  if (vendor === 'google') {
    // Google voice cloning only supports wav.
    const isClonedVoice = typeof voice === 'object' && voice.voice_cloning_key;
    return isClonedVoice ? 'wav' : 'mp3';
  }

  // Custom vendors follow the streaming rule; anything else is mp3.
  if (vendor.startsWith('custom')) {
    return streamingAwareExtension();
  }
  return 'mp3';
}
|
||||
|
||||
const noopLogger = {
|
||||
info: () => {},
|
||||
|
||||
@@ -888,24 +888,15 @@ test('TTS Cache tests', async(t) => {
|
||||
language: 'non-existing',
|
||||
voice: 'non-existing',
|
||||
});
|
||||
t.ok(purgedCountWhenErrored === 0, `purged no records when specified key was not found`);
|
||||
t.ok(error, `error returned when specified key was not found`);
|
||||
t.ok(purgedCountWhenErrored === 0, 'purged no records when specified key was not found');
|
||||
t.ok(error, 'error returned when specified key was not found');
|
||||
|
||||
// make sure other tts keys are still there
|
||||
const cached = await client.keys('tts:*')
|
||||
t.ok(cached.length >= 1, `successfully kept all non-specified tts records in cache`);
|
||||
const cached = await client.keys('tts:*');
|
||||
t.ok(cached.length >= 1, 'successfully kept all non-specified tts records in cache');
|
||||
|
||||
// retrieve keys from cache and check the key contains the file extension
|
||||
let key = cached[0];
|
||||
t.ok(key.includes('mp3'), `tts cache extension shoult be part of the key and equal mp3`);
|
||||
|
||||
process.env.VG_TRIM_TTS_SILENCE = 'true';
|
||||
process.env.VG_TRIM_TTS_SILENCE = 'true';
|
||||
await client.set(makeSynthKey({ vendor: 'azure' }), 'value');
|
||||
|
||||
const r8Keys = await client.keys('tts:r8*');
|
||||
key = r8Keys[0];
|
||||
t.ok(key.includes('r8'), `tts cache extension shoult be part of the key and equal r8`);
|
||||
|
||||
} catch (err) {
|
||||
console.error(JSON.stringify(err));
|
||||
t.end(err);
|
||||
|
||||
Reference in New Issue
Block a user