remove audio extension from audio key

Quan HL
2024-12-18 16:46:24 +07:00
parent e5b5e3d0c6
commit 7327190471
4 changed files with 196 additions and 194 deletions

View File

@@ -3,6 +3,27 @@ const {noopLogger, makeSynthKey} = require('./utils');
const {JAMBONES_TTS_CACHE_DURATION_MINS} = require('./config');
const EXPIRES = JAMBONES_TTS_CACHE_DURATION_MINS;
function getExtensionAndSampleRate(path) {
const match = path.match(/\.([^.]*)$/);
if (!match) {
// default should be a wav file.
return ['wav', 8000];
}
const extension = match[1];
const sampleRateMap = {
r8: 8000,
r16: 16000,
r24: 24000,
r44: 44100,
r48: 48000,
r96: 96000,
};
const sampleRate = sampleRateMap[extension] || 8000;
return [extension, sampleRate];
}
async function addFileToCache(client, logger, path,
{account_sid, vendor, language, voice, deploymentId, engine, text}) {
let key;
@@ -17,8 +38,15 @@ async function addFileToCache(client, logger, path,
engine,
text,
});
const [extension, sampleRate] = getExtensionAndSampleRate(path);
const audioBuffer = await fs.readFile(path);
await client.setex(key, EXPIRES, audioBuffer.toString('base64'));
await client.setex(key, EXPIRES, JSON.stringify(
{
audioContent: audioBuffer.toString('base64'),
extension,
sampleRate
}
));
} catch (err) {
logger.error(err, 'addFileToCache: Error');
return;

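For context, a minimal sketch of the new cache payload written by addFileToCache, assuming a node-redis-style client and the EXPIRES TTL from the surrounding code (the 'r16' example values are illustrative):

// the payload now carries the audio plus its format metadata
const payload = {
  audioContent: audioBuffer.toString('base64'), // base64-encoded audio bytes
  extension: 'r16',                             // parsed from the file suffix
  sampleRate: 16000                             // looked up in sampleRateMap
};
await client.setex(key, EXPIRES, JSON.stringify(payload));

// reading it back restores the Buffer and the format info together
const cached = JSON.parse(await client.get(key));
const audio = Buffer.from(cached.audioContent, 'base64');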
View File

@@ -46,7 +46,7 @@ const {
JAMBONES_HTTP_PROXY_IP,
JAMBONES_HTTP_PROXY_PORT,
JAMBONES_TTS_CACHE_DURATION_MINS,
JAMBONES_EAGERLY_PRE_CACHE_AUDIO,
JAMBONES_TTS_TRIM_SILENCE,
} = require('./config');
const EXPIRES = JAMBONES_TTS_CACHE_DURATION_MINS;
const OpenAI = require('openai');
@@ -91,7 +91,7 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
vendor, language, voice, gender, text, engine, salt, model, credentials, deploymentId,
disableTtsCache, renderForCaching = false, disableTtsStreaming, options
}) {
let audioBuffer;
let audioData;
let servedFromCache = false;
let rtt;
logger = logger || noopLogger;
@@ -172,54 +172,22 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
text,
renderForCaching
});
let filePath;
// used only for custom vendor
let fileExtension;
filePath = makeFilePath({vendor, voice, key, salt, renderForCaching});
debug(`synth key is ${key}`);
let cached;
if (!disableTtsCache) {
cached = await client.get(key);
/**
* If we are using tts streaming and also precaching audio, audio could have been cached by streaming (r8)
* or here in speech-utils due to precaching (mp3), so we need to check for both keys.
*/
if (!cached && JAMBONES_EAGERLY_PRE_CACHE_AUDIO) {
const preCachekey = makeSynthKey({
account_sid,
vendor,
language: language || '',
voice: voice || deploymentId,
engine,
text,
renderForCaching: true
});
cached = await client.get(preCachekey);
if (cached) {
// Precached audio is available; update filePath with the precache file extension.
filePath = makeFilePath({vendor, voice, key, salt, renderForCaching: true});
}
}
}
if (cached) {
// found in cache - extend the expiry and use it
debug('result WAS found in cache');
servedFromCache = true;
stats.increment('tts.cache.requests', ['found:yes']);
if (vendor.startsWith('custom')) {
// custom vendors support multiple mime types such as mp3, wav, r8, r16, etc.;
// the mime type/file extension is available when the http response has a Content-Type header.
// In the cache, the file extension is stored together with the audioBuffer in a JSON object.
// Normal cached audio is a base64 string.
const payload = JSON.parse(cached);
filePath = filePath.replace(/\.[^\.]*$/g, payload.fileExtension);
audioBuffer = Buffer.from(payload.audioBuffer, 'base64');
} else {
audioBuffer = Buffer.from(cached, 'base64');
}
audioData = JSON.parse(cached);
// convert base64 audio to buffer
audioData.audioContent = Buffer.from(audioData.audioContent, 'base64');
client.expire(key, EXPIRES).catch((err) => logger.info(err, 'Error setting expires'));
}
if (!cached) {
} else {
// not found in cache - go get it from speech vendor and add to cache
debug('result was NOT found in cache');
stats.increment('tts.cache.requests', ['found:no']);
@@ -227,94 +195,92 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
const startAt = process.hrtime();
switch (vendor) {
case 'google':
audioBuffer = await synthGoogle(logger, {credentials, stats, language, voice, gender, text});
audioData = await synthGoogle(logger, {credentials, stats, language, voice, gender, text});
break;
case 'aws':
case 'polly':
vendorLabel = 'aws';
audioBuffer = await synthPolly(createHash, retrieveHash, logger,
audioData = await synthPolly(createHash, retrieveHash, logger,
{credentials, stats, language, voice, text, engine});
break;
case 'azure':
case 'microsoft':
vendorLabel = 'microsoft';
audioBuffer = await synthMicrosoft(logger, {credentials, stats, language, voice, text, deploymentId,
filePath, renderForCaching, disableTtsStreaming});
audioData = await synthMicrosoft(logger, {credentials, stats, language, voice, text, deploymentId,
renderForCaching, disableTtsStreaming});
break;
case 'nuance':
model = model || 'enhanced';
audioBuffer = await synthNuance(client, logger, {credentials, stats, voice, model, text});
audioData = await synthNuance(client, logger, {credentials, stats, voice, model, text});
break;
case 'nvidia':
audioBuffer = await synthNvidia(client, logger, {credentials, stats, language, voice, model, text});
audioData = await synthNvidia(client, logger, {credentials, stats, language, voice, model, text});
break;
case 'ibm':
audioBuffer = await synthIbm(logger, {credentials, stats, voice, text});
audioData = await synthIbm(logger, {credentials, stats, voice, text});
break;
case 'wellsaid':
audioBuffer = await synthWellSaid(logger, {credentials, stats, language, voice, text, filePath});
audioData = await synthWellSaid(logger, {credentials, stats, language, voice, text});
break;
case 'elevenlabs':
audioBuffer = await synthElevenlabs(logger, {
credentials, options, stats, language, voice, text, renderForCaching, disableTtsStreaming, filePath
});
audioData = await synthElevenlabs(logger, {
credentials, options, stats, language, voice, text, renderForCaching, disableTtsStreaming});
break;
case 'playht':
audioBuffer = await synthPlayHT(client, logger, {
credentials, options, stats, language, voice, text, renderForCaching, disableTtsStreaming, filePath
});
audioData = await synthPlayHT(client, logger, {
credentials, options, stats, language, voice, text, renderForCaching, disableTtsStreaming});
break;
case 'cartesia':
audioBuffer = await synthCartesia(logger, {
credentials, options, stats, language, voice, text, renderForCaching, disableTtsStreaming, filePath
});
audioData = await synthCartesia(logger, {
credentials, options, stats, language, voice, text, renderForCaching, disableTtsStreaming});
break;
case 'rimelabs':
audioBuffer = await synthRimelabs(logger, {
credentials, options, stats, language, voice, text, renderForCaching, disableTtsStreaming, filePath
});
audioData = await synthRimelabs(logger, {
credentials, options, stats, language, voice, text, renderForCaching, disableTtsStreaming});
break;
case 'whisper':
audioBuffer = await synthWhisper(logger, {
audioData = await synthWhisper(logger, {
credentials, stats, voice, text, renderForCaching, disableTtsStreaming});
break;
case 'verbio':
audioBuffer = await synthVerbio(client, logger, {
audioData = await synthVerbio(client, logger, {
credentials, stats, voice, text, renderForCaching, disableTtsStreaming});
if (audioBuffer?.filePath) return audioBuffer;
if (audioData?.filePath) return audioData;
break;
case 'deepgram':
audioBuffer = await synthDeepgram(logger, {credentials, stats, model, text,
audioData = await synthDeepgram(logger, {credentials, stats, model, text,
renderForCaching, disableTtsStreaming});
break;
case vendor.startsWith('custom') ? vendor : 'cant_match_value':
({ audioBuffer, filePath, fileExtension } = await synthCustomVendor(logger,
{credentials, stats, language, voice, text, filePath}));
audioData = await synthCustomVendor(logger,
{credentials, stats, language, voice, text});
break;
default:
assert(`synthAudio: unsupported speech vendor ${vendor}`);
}
if ('filePath' in audioBuffer) return audioBuffer;
if ('filePath' in audioData) return audioData;
const diff = process.hrtime(startAt);
const time = diff[0] * 1e3 + diff[1] * 1e-6;
rtt = time.toFixed(0);
stats.histogram('tts.response_time', rtt, [`vendor:${vendorLabel}`]);
debug(`tts rtt time for ${text.length} chars on ${vendorLabel}: ${rtt}`);
logger.info(`tts rtt time for ${text.length} chars on ${vendorLabel}: ${rtt}`);
const base64Audio = audioBuffer.toString('base64');
const cacheContent = vendor.startsWith('custom') ?
JSON.stringify({
audioBuffer: base64Audio,
fileExtension
}) : base64Audio;
client.setex(key, EXPIRES, cacheContent)
// Save the audio JSON to the cache
client.setex(key, EXPIRES, JSON.stringify({
...audioData,
audioContent: audioData.audioContent?.toString('base64')
}))
.catch((err) => logger.error(err, `error calling setex on key ${key}`));
}
return new Promise((resolve, reject) => {
fs.writeFile(filePath, audioBuffer, (err) => {
const { audioContent, extension } = audioData;
const filePath = makeFilePath({
key,
salt,
extension
});
fs.writeFile(filePath, audioContent, (err) => {
if (err) return reject(err);
resolve({filePath, servedFromCache, rtt});
});
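For orientation, a hedged usage sketch of what synthAudio now resolves with; the argument values below are assumptions, only the returned shape comes from the code above:

// hypothetical caller: the resolved filePath already carries the
// extension reported by the vendor helper (e.g. '.mp3' or '.r8')
const { filePath, servedFromCache, rtt } = await synthAudio(
  client, createHash, retrieveHash, logger, stats,
  { account_sid, vendor: 'google', language: 'en-US',
    voice: 'en-US-Wavenet-A', text: 'hello world' }
);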
@@ -369,7 +335,13 @@ const synthPolly = async(createHash, retrieveHash, logger,
.on('data', (chunk) => {
chunks.push(chunk);
})
.on('end', () => resolve(Buffer.concat(chunks)));
.on('end', () => resolve(
{
audioContent: Buffer.concat(chunks),
extension: 'mp3',
sampleRate: 8000
}
));
});
} catch (err) {
logger.info({err}, 'synthAudio: Error synthesizing speech using aws polly');
@@ -411,7 +383,11 @@ const synthGoogle = async(logger, {credentials, stats, language, voice, gender,
};
const wav = await post('/v1beta1/text:synthesize', payload);
return Buffer.from(wav.audioContent, 'base64');
return {
audioContent: Buffer.from(wav.audioContent, 'base64'),
extension: 'wav',
sampleRate: 24000
};
} catch (err) {
logger.info({err: await err.text()}, 'synthGoogle returned error');
throw err;
@@ -432,7 +408,11 @@ const synthGoogle = async(logger, {credentials, stats, language, voice, gender,
const responses = await client.synthesizeSpeech(opts);
stats.increment('tts.count', ['vendor:google', 'accepted:yes']);
client.close();
return responses[0].audioContent;
return {
audioContent: responses[0].audioContent,
extension: 'mp3',
sampleRate: 8000
};
} catch (err) {
console.error(err);
logger.info({err, opts}, 'synthAudio: Error synthesizing speech using google');
@@ -463,7 +443,11 @@ const synthIbm = async(logger, {credentials, stats, voice, text}) => {
for await (const chunk of r.result) {
chunks.push(chunk);
}
return Buffer.concat(chunks);
return {
audioContent: Buffer.concat(chunks),
extension: 'mp3',
sampleRate: 8000
};
} catch (err) {
logger.info({err, params}, 'synthAudio: Error synthesizing speech using ibm');
stats.increment('tts.count', ['vendor:ibm', 'accepted:no']);
@@ -473,11 +457,9 @@ const synthIbm = async(logger, {credentials, stats, voice, text}) => {
async function _synthOnPremMicrosoft(logger, {
credentials,
stats,
language,
voice,
text,
filePath
text
}) {
const {use_custom_tts, custom_tts_endpoint_url, api_key} = credentials;
let content = text;
@@ -499,15 +481,19 @@ async function _synthOnPremMicrosoft(logger, {
}
try {
const trimSilence = filePath.endsWith('.r8');
const trimSilence = JAMBONES_TTS_TRIM_SILENCE;
const post = bent('POST', 'buffer', {
'X-Microsoft-OutputFormat': trimSilence ? 'raw-8khz-16bit-mono-pcm' : 'audio-16khz-32kbitrate-mono-mp3',
'Content-Type': 'application/ssml+xml',
'User-Agent': 'Jambonz',
...(api_key && {'Ocp-Apim-Subscription-Key': api_key})
});
const mp3 = await post(custom_tts_endpoint_url, content);
return mp3;
const audioContent = await post(custom_tts_endpoint_url, content);
return {
audioContent,
extension: trimSilence ? 'r8' : 'mp3',
sampleRate: 8000
};
} catch (err) {
logger.info({err}, '_synthMicrosoftByHttp returned error');
throw err;
@@ -520,7 +506,6 @@ const synthMicrosoft = async(logger, {
language,
voice,
text,
filePath,
renderForCaching,
disableTtsStreaming
}) => {
@@ -563,17 +548,18 @@ const synthMicrosoft = async(logger, {
rtt: 0
};
}
// Azure on-prem
if (use_custom_tts && custom_tts_endpoint_url) {
return await _synthOnPremMicrosoft(logger, {
credentials,
stats,
language,
voice,
text,
filePath
text
});
}
const trimSilence = filePath.endsWith('.r8');
// Azure hosted service
const trimSilence = JAMBONES_TTS_TRIM_SILENCE;
const speechConfig = SpeechConfig.fromSubscription(apiKey, region);
speechConfig.speechSynthesisLanguage = language;
speechConfig.speechSynthesisVoiceName = voice;
@@ -608,7 +594,11 @@ const synthMicrosoft = async(logger, {
case ResultReason.SynthesizingAudioCompleted:
let buffer = Buffer.from(result.audioData);
if (trimSilence) buffer = trimTrailingSilence(buffer);
resolve(buffer);
resolve({
audioContent: buffer,
extension: trimSilence ? 'r8' : 'mp3',
sampleRate: 8000
});
synthesizer.close();
stats.increment('tts.count', ['vendor:microsoft', 'accepted:yes']);
break;
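With the filePath.endsWith('.r8') check gone, output format selection for Microsoft is driven entirely by config; a minimal sketch of the mapping, assuming JAMBONES_TTS_TRIM_SILENCE is a boolean exported by ./config:

// trimSilence selects raw 8kHz PCM ('r8') over mp3 in both the
// on-prem and hosted Azure paths
const trimSilence = JAMBONES_TTS_TRIM_SILENCE;
const outputFormat = trimSilence
  ? 'raw-8khz-16bit-mono-pcm'
  : 'audio-16khz-32kbitrate-mono-mp3';
const extension = trimSilence ? 'r8' : 'mp3'; // stored alongside the audio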
@@ -638,11 +628,15 @@ const synthWellSaid = async(logger, {credentials, stats, language, voice, gender
'Accept': 'audio/mpeg',
'Content-Type': 'application/json'
});
const mp3 = await post('/v1/tts/stream', {
const audioContent = await post('/v1/tts/stream', {
text,
speaker_id: voice
});
return mp3;
return {
audioContent,
extension: 'mp3',
sampleRate: 8000
};
} catch (err) {
logger.info({err}, 'testWellSaidTts returned error');
throw err;
@@ -679,8 +673,8 @@ const synthNuance = async(client, logger, {credentials, stats, voice, model, tex
t.setText(text);
input.setText(t);
}
pcm.setSampleRateHz(8000);
const sampleRate = 8000;
pcm.setSampleRateHz(sampleRate);
f.setPcm(pcm);
p.setAudioFormat(f);
v.setName(voice);
@@ -704,7 +698,11 @@ const synthNuance = async(client, logger, {credentials, stats, voice, model, tex
const details = status.getDetails();
return reject({code, message, details});
}
resolve(Buffer.from(response.getAudio()));
resolve({
audioContent: Buffer.from(response.getAudio()),
extension: 'r8',
sampleRate
});
});
});
};
@@ -712,12 +710,13 @@ const synthNuance = async(client, logger, {credentials, stats, voice, model, tex
const synthNvidia = async(client, logger, {credentials, stats, language, voice, model, text}) => {
const {riva_server_uri} = credentials;
let rivaClient, request;
const sampleRate = 8000;
try {
rivaClient = await createRivaClient(riva_server_uri);
request = new SynthesizeSpeechRequest();
request.setVoiceName(voice);
request.setLanguageCode(language);
request.setSampleRateHz(8000);
request.setSampleRateHz(sampleRate);
request.setEncoding(AudioEncoding.LINEAR_PCM);
request.setText(text);
} catch (err) {
@@ -731,7 +730,11 @@ const synthNvidia = async(client, logger, {credentials, stats, language, voice,
logger.info({err, voice, language}, 'error synthesizing speech using Nvidia');
return reject(err);
}
resolve(Buffer.from(response.getAudio()));
resolve({
audioContent: Buffer.from(response.getAudio()),
extension: 'r8',
sampleRate
});
});
});
};
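A pattern worth calling out: after this change every synth* helper resolves with the same envelope. A hedged sketch of that contract; the wrapper function below is purely illustrative, not part of the commit:

// shared result shape: { audioContent: Buffer, extension: string, sampleRate: number }
function toAudioData(buffer, extension, sampleRate) {
  return { audioContent: buffer, extension, sampleRate };
}
// e.g. the Nuance/Nvidia paths above amount to:
// resolve(toAudioData(Buffer.from(response.getAudio()), 'r8', 8000));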
@@ -753,14 +756,13 @@ const synthCustomVendor = async(logger, {credentials, stats, language, voice, te
text
});
const regex = /\.[^\.]*$/g;
const mime = response.headers['content-type'];
const buffer = await response.arrayBuffer();
const fileExtension = getFileExtFromMime(mime);
const [extension, sampleRate] = getFileExtFromMime(mime);
return {
audioBuffer: buffer,
filePath: filePath.replace(regex, fileExtension),
fileExtension
audioContent: buffer,
extension,
sampleRate
};
} catch (err) {
logger.info({err}, `Vendor ${vendor} returned error`);
@@ -806,7 +808,7 @@ const synthElevenlabs = async(logger, {
'Accept': 'audio/mpeg',
'Content-Type': 'application/json'
});
const mp3 = await post(`/v1/text-to-speech/${voice}${optimize_streaming_latency}`, {
const audioContent = await post(`/v1/text-to-speech/${voice}${optimize_streaming_latency}`, {
text,
model_id,
voice_settings: {
@@ -815,7 +817,11 @@ const synthElevenlabs = async(logger, {
},
...opts
});
return mp3;
return {
audioContent,
extension: 'mp3',
sampleRate: 8000
};
} catch (err) {
logger.info({err}, 'synth Elevenlabs returned error');
stats.increment('tts.count', ['vendor:elevenlabs', 'accepted:no']);
@@ -897,7 +903,7 @@ const synthPlayHT = async(client, logger, {
'Content-Type': 'application/json'
});
const mp3 = await post(synthesizeUrl, {
const audioContent = await post(synthesizeUrl, {
text,
...(voice_engine === 'Play3.0' && { language }),
voice,
@@ -906,7 +912,11 @@ const synthPlayHT = async(client, logger, {
sample_rate: 8000,
...opts
});
return mp3;
return {
audioContent,
extension: 'mp3',
sampleRate: 8000
};
} catch (err) {
logger.info({err}, 'synth PlayHT returned error');
stats.increment('tts.count', ['vendor:playht', 'accepted:no']);
@@ -945,14 +955,19 @@ const synthRimelabs = async(logger, {
'Accept': 'audio/mp3',
'Content-Type': 'application/json'
});
const mp3 = await post('/v1/rime-tts', {
const sampleRate = 8000;
const audioContent = await post('/v1/rime-tts', {
speaker: voice,
text,
modelId: model_id,
samplingRate: 8000,
samplingRate: sampleRate,
...opts
});
return mp3;
return {
audioContent,
extension: 'mp3',
sampleRate
};
} catch (err) {
logger.info({err}, 'synth rimelabs returned error');
stats.increment('tts.count', ['vendor:rimelabs', 'accepted:no']);
@@ -986,13 +1001,17 @@ const synthVerbio = async(client, logger, {credentials, stats, voice, text, rend
'User-Agent': 'jambonz',
'Content-Type': 'application/json'
});
const r8 = await post('/api/v1/synthesize', {
const audioContent = await post('/api/v1/synthesize', {
voice_id: voice,
output_sample_rate: '8k',
output_encoding: 'pcm16',
text
});
return r8;
return {
audioContent,
extension: 'r8',
sampleRate: 8000
};
} catch (err) {
logger.info({err}, 'synth Verbio returned error');
stats.increment('tts.count', ['vendor:verbio', 'accepted:no']);
@@ -1032,7 +1051,11 @@ const synthWhisper = async(logger, {credentials, stats, voice, text, renderForCa
input: text,
response_format: 'mp3'
});
return Buffer.from(await mp3.arrayBuffer());
return {
audioContent: Buffer.from(await mp3.arrayBuffer()),
extension: 'mp3',
sampleRate: 8000
};
} catch (err) {
logger.info({err}, 'synth whisper returned error');
stats.increment('tts.count', ['vendor:openai', 'accepted:no']);
@@ -1064,10 +1087,14 @@ const synthDeepgram = async(logger, {credentials, stats, model, text, renderForC
'Accept': 'audio/mpeg',
'Content-Type': 'application/json'
});
const mp3 = await post(`/v1/speak?model=${model}`, {
const audioContent = await post(`/v1/speak?model=${model}`, {
text
});
return mp3;
return {
audioContent,
extension: 'mp3',
sampleRate: 8000
};
} catch (err) {
logger.info({err}, 'synth Deepgram returned error');
stats.increment('tts.count', ['vendor:deepgram', 'accepted:no']);
@@ -1104,6 +1131,7 @@ const synthCartesia = async(logger, {
try {
const client = new CartesiaClient({ apiKey: api_key });
const sampleRate = 48000;
const mp3 = await client.tts.bytes({
modelId: model_id,
transcript: text,
@@ -1119,7 +1147,7 @@ const synthCartesia = async(logger, {
),
...(opts.speed || opts.emotion && {
experimentalControls: {
...(opts.speed && {speed: opts.speed}),
...(opts.speed !== null && opts.speed !== undefined && {speed: opts.speed}),
...(opts.emotion && {emotion: opts.emotion}),
}
})
@@ -1128,10 +1156,14 @@ const synthCartesia = async(logger, {
outputFormat: {
container: 'mp3',
bitRate: 128000,
sampleRate: 8000
sampleRate
},
});
return Buffer.from(mp3);
return {
audioContent: Buffer.from(mp3),
extension: 'mp3',
sampleRate
};
} catch (err) {
logger.info({err}, 'synth Cartesia returned error');
stats.increment('tts.count', ['vendor:cartesia', 'accepted:no']);
@@ -1144,22 +1176,22 @@ const getFileExtFromMime = (mime) => {
switch (mime) {
case 'audio/wav':
case 'audio/x-wav':
return '.wav';
return ['wav', 8000];
case /audio\/l16.*rate=8000/.test(mime) ? mime : 'cant match value':
return '.r8';
return ['r8', 8000];
case /audio\/l16.*rate=16000/.test(mime) ? mime : 'cant match value':
return '.r16';
return ['r16', 16000];
case /audio\/l16.*rate=24000/.test(mime) ? mime : 'cant match value':
return '.r24';
return ['r24', 24000];
case /audio\/l16.*rate=32000/.test(mime) ? mime : 'cant match value':
return '.r32';
return ['r32', 32000];
case /audio\/l16.*rate=48000/.test(mime) ? mime : 'cant match value':
return '.r48';
return ['r48', 48000];
case 'audio/mpeg':
case 'audio/mp3':
return '.mp3';
return ['mp3', 8000];
default:
return '.wav';
return ['wav', 8000];
}
};
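A quick hedged illustration of the reworked helper; the mime strings are examples of what a custom vendor might send in its Content-Type header:

// getFileExtFromMime now returns [extension, sampleRate]
// instead of a dotted extension string
getFileExtFromMime('audio/l16;rate=16000'); // => ['r16', 16000]
getFileExtFromMime('audio/mpeg');           // => ['mp3', 8000]
getFileExtFromMime('application/unknown');  // => ['wav', 8000] (fallback)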

View File

@@ -6,7 +6,7 @@ const pool = new Pool('https://auth.crt.nuance.com');
const NUANCE_AUTH_ENDPOINT = 'tts.api.nuance.com:443';
const grpc = require('@grpc/grpc-js');
const formurlencoded = require('form-urlencoded');
const { JAMBONES_DISABLE_TTS_STREAMING, JAMBONES_TTS_TRIM_SILENCE, TMP_FOLDER, HTTP_TIMEOUT } = require('./config');
const { TMP_FOLDER, HTTP_TIMEOUT } = require('./config');
const debug = require('debug')('jambonz:realtimedb-helpers');
/**
@@ -17,68 +17,19 @@ const debug = require('debug')('jambonz:realtimedb-helpers');
//const nuanceClientMap = new Map();
function makeSynthKey({
account_sid = '', vendor, language, voice, engine = '', text,
renderForCaching = false}) {
account_sid = '', vendor, language, voice, engine = '', text}) {
const hash = crypto.createHash('sha1');
hash.update(`${language}:${vendor}:${voice}:${engine}:${text}`);
const hexHashKey = hash.digest('hex');
const accountKey = account_sid ? `:${account_sid}` : '';
const namespace = vendor.startsWith('custom') ? vendor : getFileExtension({vendor, voice, renderForCaching});
const key = `tts${accountKey}:${namespace}:${hexHashKey}`;
const key = `tts${accountKey}:${hexHashKey}`;
return key;
}
function makeFilePath({vendor, voice, key, salt = '', renderForCaching = false}) {
const extension = getFileExtension({vendor, renderForCaching, voice});
function makeFilePath({key, salt = '', extension}) {
return `${TMP_FOLDER}/${key.replace('tts:', `tts-${salt}`)}.${extension}`;
}
function getFileExtension({vendor, voice, renderForCaching = false}) {
const mp3Extension = 'mp3';
const r8Extension = 'r8';
const wavExtension = 'wav';
switch (vendor) {
case 'azure':
case 'microsoft':
if (!renderForCaching && !JAMBONES_DISABLE_TTS_STREAMING || JAMBONES_TTS_TRIM_SILENCE) {
return r8Extension;
} else {
return mp3Extension;
}
case 'deepgram':
case 'elevenlabs':
case 'rimelabs':
case 'playht':
case 'cartesia':
if (renderForCaching || JAMBONES_DISABLE_TTS_STREAMING) {
return mp3Extension;
} else {
return r8Extension;
}
case 'nuance':
case 'nvidia':
case 'verbio':
return r8Extension;
case 'google':
// google voice cloning only supports wav.
if (typeof voice === 'object' && voice.voice_cloning_key) {
return wavExtension;
} else {
return mp3Extension;
}
default:
// If vendor is custom
if (vendor.startsWith('custom')) {
if (renderForCaching || JAMBONES_DISABLE_TTS_STREAMING) {
return mp3Extension;
} else {
return r8Extension;
}
}
return mp3Extension;
}
}
const noopLogger = {
info: () => {},

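To see what the simplified helpers produce end to end, a hedged sketch; the hash is abbreviated and the salt value is an assumption:

// keys no longer embed an extension namespace
const key = makeSynthKey({
  vendor: 'google', language: 'en-US',
  voice: 'en-US-Wavenet-A', text: 'hello'
});
// => 'tts:3f2a…' (tts:<sha1-hex>, with ':<account_sid>' inserted when provided)

// the extension now arrives with the cached payload and is passed in here
const filePath = makeFilePath({ key, salt: '42', extension: 'mp3' });
// => `${TMP_FOLDER}/tts-423f2a….mp3`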
View File

@@ -888,24 +888,15 @@ test('TTS Cache tests', async(t) => {
language: 'non-existing',
voice: 'non-existing',
});
t.ok(purgedCountWhenErrored === 0, `purged no records when specified key was not found`);
t.ok(error, `error returned when specified key was not found`);
t.ok(purgedCountWhenErrored === 0, 'purged no records when specified key was not found');
t.ok(error, 'error returned when specified key was not found');
// make sure other tts keys are still there
const cached = await client.keys('tts:*')
t.ok(cached.length >= 1, `successfully kept all non-specified tts records in cache`);
const cached = await client.keys('tts:*');
t.ok(cached.length >= 1, 'successfully kept all non-specified tts records in cache');
// retrieve keys from cache and check the key contains the file extension
let key = cached[0];
t.ok(key.includes('mp3'), `tts cache extension should be part of the key and equal mp3`);
process.env.VG_TRIM_TTS_SILENCE = 'true';
await client.set(makeSynthKey({ vendor: 'azure' }), 'value');
const r8Keys = await client.keys('tts:r8*');
key = r8Keys[0];
t.ok(key.includes('r8'), `tts cache extension should be part of the key and equal r8`);
} catch (err) {
console.error(JSON.stringify(err));
t.end(err);