mirror of
https://github.com/jambonz/speech-utils.git
synced 2025-12-19 03:37:49 +00:00
Merge pull request #130 from jambonz/feat/disableTtsCache
set write_cache_file = 0 when disableTtsCache
This commit is contained in:
@@ -210,13 +210,14 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
|
||||
case 'polly':
|
||||
vendorLabel = 'aws';
|
||||
audioData = await synthPolly(createHash, retrieveHash, logger,
|
||||
{credentials, stats, language, voice, key, text, engine, renderForCaching, disableTtsStreaming});
|
||||
{credentials, stats, language, voice, key, text, engine, renderForCaching, disableTtsStreaming,
|
||||
disableTtsCache});
|
||||
break;
|
||||
case 'azure':
|
||||
case 'microsoft':
|
||||
vendorLabel = 'microsoft';
|
||||
audioData = await synthMicrosoft(logger, {credentials, stats, language, voice, key, text, deploymentId,
|
||||
renderForCaching, disableTtsStreaming});
|
||||
renderForCaching, disableTtsStreaming, disableTtsCache});
|
||||
break;
|
||||
case 'nuance':
|
||||
model = model || 'enhanced';
|
||||
@@ -224,7 +225,7 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
|
||||
break;
|
||||
case 'nvidia':
|
||||
audioData = await synthNvidia(client, logger, {credentials, stats, language, voice, model, key, text,
|
||||
renderForCaching, disableTtsStreaming});
|
||||
renderForCaching, disableTtsStreaming, disableTtsCache});
|
||||
break;
|
||||
case 'ibm':
|
||||
audioData = await synthIbm(logger, {credentials, stats, voice, key, text});
|
||||
@@ -234,44 +235,50 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
|
||||
break;
|
||||
case 'elevenlabs':
|
||||
audioData = await synthElevenlabs(logger, {
|
||||
credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming});
|
||||
credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming,
|
||||
disableTtsCache});
|
||||
break;
|
||||
case 'playht':
|
||||
audioData = await synthPlayHT(client, logger, {
|
||||
credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming});
|
||||
credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming,
|
||||
disableTtsCache});
|
||||
break;
|
||||
case 'cartesia':
|
||||
audioData = await synthCartesia(logger, {
|
||||
credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming});
|
||||
credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming,
|
||||
disableTtsCache});
|
||||
break;
|
||||
case 'inworld':
|
||||
audioData = await synthInworld(logger, {
|
||||
credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming});
|
||||
credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming,
|
||||
disableTtsCache});
|
||||
break;
|
||||
case 'rimelabs':
|
||||
audioData = await synthRimelabs(logger, {
|
||||
credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming});
|
||||
credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming,
|
||||
disableTtsCache});
|
||||
break;
|
||||
case 'whisper':
|
||||
audioData = await synthWhisper(logger, {
|
||||
credentials, stats, voice, key, text, instructions, renderForCaching, disableTtsStreaming});
|
||||
credentials, stats, voice, key, text, instructions, renderForCaching, disableTtsStreaming,
|
||||
disableTtsCache});
|
||||
break;
|
||||
case 'verbio':
|
||||
audioData = await synthVerbio(client, logger, {
|
||||
credentials, stats, voice, key, text, renderForCaching, disableTtsStreaming});
|
||||
credentials, stats, voice, key, text, renderForCaching, disableTtsStreaming, disableTtsCache});
|
||||
if (audioData?.filePath) return audioData;
|
||||
break;
|
||||
case 'deepgram':
|
||||
audioData = await synthDeepgram(logger, {credentials, stats, model, key, text,
|
||||
renderForCaching, disableTtsStreaming});
|
||||
renderForCaching, disableTtsStreaming, disableTtsCache});
|
||||
break;
|
||||
case 'resemble':
|
||||
audioData = await synthResemble(logger, {
|
||||
credentials, stats, voice, key, text, options, renderForCaching, disableTtsStreaming});
|
||||
credentials, stats, voice, key, text, options, renderForCaching, disableTtsStreaming, disableTtsCache});
|
||||
break;
|
||||
case vendor.startsWith('custom') ? vendor : 'cant_match_value':
|
||||
audioData = await synthCustomVendor(logger,
|
||||
{credentials, stats, language, voice, key, text, renderForCaching, disableTtsStreaming});
|
||||
{credentials, stats, language, voice, key, text, renderForCaching, disableTtsStreaming, disableTtsCache});
|
||||
break;
|
||||
default:
|
||||
assert(`synthAudio: unsupported speech vendor ${vendor}`);
|
||||
@@ -306,14 +313,14 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
|
||||
}
|
||||
|
||||
const synthPolly = async(createHash, retrieveHash, logger,
|
||||
{credentials, stats, language, voice, engine, key, text, renderForCaching, disableTtsStreaming}) => {
|
||||
{credentials, stats, language, voice, engine, key, text, renderForCaching, disableTtsStreaming, disableTtsCache}) => {
|
||||
const {region, accessKeyId, secretAccessKey, roleArn} = credentials;
|
||||
if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
|
||||
|
||||
let params = '{';
|
||||
params += `language=${language}`;
|
||||
params += `,playback_id=${key}`;
|
||||
params += ',write_cache_file=1';
|
||||
params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
|
||||
params += ',vendor=aws';
|
||||
if (accessKeyId && secretAccessKey) {
|
||||
if (accessKeyId) params += `,accessKeyId=${accessKeyId}`;
|
||||
@@ -563,7 +570,8 @@ const synthMicrosoft = async(logger, {
|
||||
key,
|
||||
text,
|
||||
renderForCaching,
|
||||
disableTtsStreaming
|
||||
disableTtsStreaming,
|
||||
disableTtsCache
|
||||
}) => {
|
||||
try {
|
||||
const {api_key: apiKey, region, use_custom_tts, custom_tts_endpoint, custom_tts_endpoint_url} = credentials;
|
||||
@@ -596,7 +604,7 @@ const synthMicrosoft = async(logger, {
|
||||
params += `,language=${language}`;
|
||||
params += ',vendor=microsoft';
|
||||
params += `,voice=${voice}`;
|
||||
params += ',write_cache_file=1';
|
||||
params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
|
||||
if (region) params += `,region=${region}`;
|
||||
if (custom_tts_endpoint) params += `,endpointId=${custom_tts_endpoint}`;
|
||||
if (custom_tts_endpoint_url) params += `,endpoint=${custom_tts_endpoint_url}`;
|
||||
@@ -769,7 +777,7 @@ const synthNuance = async(client, logger, {credentials, stats, voice, model, tex
|
||||
};
|
||||
|
||||
const synthNvidia = async(client, logger, {
|
||||
credentials, stats, language, voice, model, key, text, renderForCaching, disableTtsStreaming
|
||||
credentials, stats, language, voice, model, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
|
||||
}) => {
|
||||
const {riva_server_uri} = credentials;
|
||||
if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
|
||||
@@ -778,7 +786,7 @@ const synthNvidia = async(client, logger, {
|
||||
params += `,playback_id=${key}`;
|
||||
params += `,voice=${voice}`;
|
||||
params += `,language=${language}`;
|
||||
params += ',write_cache_file=1';
|
||||
params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
|
||||
params += '}';
|
||||
|
||||
return {
|
||||
@@ -819,7 +827,7 @@ const synthNvidia = async(client, logger, {
|
||||
|
||||
|
||||
const synthCustomVendor = async(logger, {credentials, stats, language, voice,
|
||||
text, filePath, renderForCaching, disableTtsStreaming, key}) => {
|
||||
text, filePath, renderForCaching, disableTtsStreaming, key, disableTtsCache}) => {
|
||||
const {vendor, auth_token, custom_tts_url} = credentials;
|
||||
|
||||
if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
|
||||
@@ -829,7 +837,7 @@ const synthCustomVendor = async(logger, {credentials, stats, language, voice,
|
||||
params += `,custom_tts_url=${custom_tts_url}`;
|
||||
params += ',vendor=custom';
|
||||
params += `,voice=${voice}`;
|
||||
params += ',write_cache_file=1';
|
||||
params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
|
||||
params += '}';
|
||||
|
||||
return {
|
||||
@@ -867,7 +875,7 @@ const synthCustomVendor = async(logger, {credentials, stats, language, voice,
|
||||
};
|
||||
|
||||
const synthElevenlabs = async(logger, {
|
||||
credentials, options, stats, voice, key, text, renderForCaching, disableTtsStreaming
|
||||
credentials, options, stats, voice, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
|
||||
}) => {
|
||||
const {api_key, model_id, options: credOpts} = credentials;
|
||||
const opts = !!options && Object.keys(options).length !== 0 ? options : JSON.parse(credOpts || '{}');
|
||||
@@ -880,7 +888,7 @@ const synthElevenlabs = async(logger, {
|
||||
params += ',vendor=elevenlabs';
|
||||
params += `,voice=${voice}`;
|
||||
params += `,model_id=${model_id}`;
|
||||
params += ',write_cache_file=1';
|
||||
params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
|
||||
if (opts.optimize_streaming_latency !== null && opts.optimize_streaming_latency !== undefined) {
|
||||
params += `,optimize_streaming_latency=${opts.optimize_streaming_latency}`;
|
||||
}
|
||||
@@ -933,7 +941,7 @@ const synthElevenlabs = async(logger, {
|
||||
};
|
||||
|
||||
const synthPlayHT = async(client, logger, {
|
||||
credentials, options, stats, voice, language, key, text, renderForCaching, disableTtsStreaming
|
||||
credentials, options, stats, voice, language, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
|
||||
}) => {
|
||||
const {api_key, user_id, voice_engine, playht_tts_uri, options: credOpts} = credentials;
|
||||
const opts = !!options && Object.keys(options).length !== 0 ? options : JSON.parse(credOpts || '{}');
|
||||
@@ -975,7 +983,7 @@ const synthPlayHT = async(client, logger, {
|
||||
params += `,voice=${voice}`;
|
||||
params += `,voice_engine=${voice_engine}`;
|
||||
params += `,synthesize_url=${synthesizeUrl}`;
|
||||
params += ',write_cache_file=1';
|
||||
params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
|
||||
params += `,language=${language}`;
|
||||
if (opts.quality) params += `,quality=${opts.quality}`;
|
||||
if (opts.speed) params += `,speed=${opts.speed}`;
|
||||
@@ -1028,7 +1036,7 @@ const synthPlayHT = async(client, logger, {
|
||||
};
|
||||
|
||||
const synthInworld = async(logger, {
|
||||
credentials, options, stats, voice, key, text, renderForCaching, disableTtsStreaming
|
||||
credentials, options, stats, voice, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
|
||||
}) => {
|
||||
const {api_key, model_id, options: credOpts} = credentials;
|
||||
const opts = !!options && Object.keys(options).length !== 0 ? options : JSON.parse(credOpts || '{}');
|
||||
@@ -1041,7 +1049,7 @@ const synthInworld = async(logger, {
|
||||
params += `,model_id=${model_id}`;
|
||||
params += ',vendor=inworld';
|
||||
params += `,voice=${voice}`;
|
||||
params += ',write_cache_file=1';
|
||||
params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
|
||||
if (opts.temperature) params += `,temperature=${opts.temperature}`;
|
||||
if (opts.audioConfig?.pitch) params += `,pitch=${opts.pitch}`;
|
||||
if (opts.audioConfig?.speakingRate) params += `,speakingRate=${opts.speakingRate}`;
|
||||
@@ -1093,7 +1101,7 @@ const synthInworld = async(logger, {
|
||||
};
|
||||
|
||||
const synthRimelabs = async(logger, {
|
||||
credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming
|
||||
credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
|
||||
}) => {
|
||||
const {api_key, model_id, options: credOpts} = credentials;
|
||||
const opts = !!options && Object.keys(options).length !== 0 ? options : JSON.parse(credOpts || '{}');
|
||||
@@ -1107,7 +1115,7 @@ const synthRimelabs = async(logger, {
|
||||
params += ',vendor=rimelabs';
|
||||
params += `,language=${language}`;
|
||||
params += `,voice=${voice}`;
|
||||
params += ',write_cache_file=1';
|
||||
params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
|
||||
if (opts.speedAlpha) params += `,speed_alpha=${opts.speedAlpha}`;
|
||||
if (opts.reduceLatency) params += `,reduce_latency=${opts.reduceLatency}`;
|
||||
// Arcana model parameters
|
||||
@@ -1151,7 +1159,7 @@ const synthRimelabs = async(logger, {
|
||||
}
|
||||
};
|
||||
const synthVerbio = async(client, logger, {
|
||||
credentials, stats, voice, key, text, renderForCaching, disableTtsStreaming
|
||||
credentials, stats, voice, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
|
||||
}) => {
|
||||
//https://doc.speechcenter.verbio.com/#tag/Text-To-Speech-REST-API
|
||||
if (text.length > 2000) {
|
||||
@@ -1164,7 +1172,7 @@ const synthVerbio = async(client, logger, {
|
||||
params += `,playback_id=${key}`;
|
||||
params += ',vendor=verbio';
|
||||
params += `,voice=${voice}`;
|
||||
params += ',write_cache_file=1';
|
||||
params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
|
||||
params += '}';
|
||||
|
||||
return {
|
||||
@@ -1199,7 +1207,7 @@ const synthVerbio = async(client, logger, {
|
||||
};
|
||||
|
||||
const synthWhisper = async(logger, {credentials, stats, voice, key, text, instructions,
|
||||
renderForCaching, disableTtsStreaming}) => {
|
||||
renderForCaching, disableTtsStreaming, disableTtsCache}) => {
|
||||
const {api_key, model_id, baseURL, timeout, speed} = credentials;
|
||||
/* if the env is set to stream then bail out early, unless we are specifically rendering to generate a cache file */
|
||||
if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
|
||||
@@ -1209,7 +1217,7 @@ const synthWhisper = async(logger, {credentials, stats, voice, key, text, instru
|
||||
params += `,model_id=${model_id}`;
|
||||
params += ',vendor=whisper';
|
||||
params += `,voice=${voice}`;
|
||||
params += ',write_cache_file=1';
|
||||
params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
|
||||
if (speed) params += `,speed=${speed}`;
|
||||
// a comma is used to separate parameters in the freeswitch tts module
|
||||
if (instructions) params += `,instructions=${instructions.replace(/\n/g, ' ').replace(/,/g, ';')}`;
|
||||
@@ -1247,7 +1255,8 @@ const synthWhisper = async(logger, {credentials, stats, voice, key, text, instru
|
||||
}
|
||||
};
|
||||
|
||||
const synthDeepgram = async(logger, {credentials, stats, model, key, text, renderForCaching, disableTtsStreaming}) => {
|
||||
const synthDeepgram = async(logger, {credentials, stats, model, key, text, renderForCaching,
|
||||
disableTtsStreaming, disableTtsCache}) => {
|
||||
const {api_key, deepgram_tts_uri} = credentials;
|
||||
if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
|
||||
let params = '{';
|
||||
@@ -1255,7 +1264,7 @@ const synthDeepgram = async(logger, {credentials, stats, model, key, text, rende
|
||||
params += `,playback_id=${key}`;
|
||||
params += ',vendor=deepgram';
|
||||
params += `,voice=${model}`;
|
||||
params += ',write_cache_file=1';
|
||||
params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
|
||||
if (deepgram_tts_uri) params += `,endpoint=${deepgram_tts_uri}`;
|
||||
params += '}';
|
||||
|
||||
@@ -1288,7 +1297,7 @@ const synthDeepgram = async(logger, {credentials, stats, model, key, text, rende
|
||||
};
|
||||
|
||||
const synthCartesia = async(logger, {
|
||||
credentials, options, stats, voice, language, key, text, renderForCaching, disableTtsStreaming
|
||||
credentials, options, stats, voice, language, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
|
||||
}) => {
|
||||
const {api_key, model_id, embedding, options: credOpts} = credentials;
|
||||
const opts = !!options && Object.keys(options).length !== 0 ? options : JSON.parse(credOpts || '{}');
|
||||
@@ -1300,7 +1309,7 @@ const synthCartesia = async(logger, {
|
||||
params += `,model_id=${model_id}`;
|
||||
params += ',vendor=cartesia';
|
||||
params += `,voice=${voice}`;
|
||||
params += ',write_cache_file=1';
|
||||
params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
|
||||
params += `,language=${language}`;
|
||||
params += `,voice_mode=${embedding ? 'embedding' : 'id'}`;
|
||||
if (embedding) params += `,embedding=${embedding}`;
|
||||
@@ -1359,7 +1368,7 @@ const synthCartesia = async(logger, {
|
||||
};
|
||||
|
||||
const synthResemble = async(logger, {
|
||||
credentials, options, stats, voice, key, text, renderForCaching, disableTtsStreaming
|
||||
credentials, options, stats, voice, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
|
||||
}) => {
|
||||
const {api_key, resemble_tts_uri, resemble_tts_use_tls} = credentials;
|
||||
const {project_uuid, use_hd} = options || {};
|
||||
@@ -1371,7 +1380,7 @@ const synthResemble = async(logger, {
|
||||
params += `,playback_id=${key}`;
|
||||
params += ',vendor=resemble';
|
||||
params += `,voice=${voice}`;
|
||||
params += ',write_cache_file=1';
|
||||
params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
|
||||
if (project_uuid) params += `,project_uuid=${project_uuid}`;
|
||||
if (use_hd) params += `,use_hd=${use_hd}`;
|
||||
if (resemble_tts_uri) params += `,endpoint=${resemble_tts_uri}`;
|
||||
|
||||
Reference in New Issue
Block a user