Merge pull request #130 from jambonz/feat/disableTtsCache

Set write_cache_file=0 when disableTtsCache is set
This commit is contained in:
Dave Horton
2025-10-03 02:21:57 -04:00
committed by GitHub

View File

@@ -210,13 +210,14 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
case 'polly':
vendorLabel = 'aws';
audioData = await synthPolly(createHash, retrieveHash, logger,
{credentials, stats, language, voice, key, text, engine, renderForCaching, disableTtsStreaming});
{credentials, stats, language, voice, key, text, engine, renderForCaching, disableTtsStreaming,
disableTtsCache});
break;
case 'azure':
case 'microsoft':
vendorLabel = 'microsoft';
audioData = await synthMicrosoft(logger, {credentials, stats, language, voice, key, text, deploymentId,
renderForCaching, disableTtsStreaming});
renderForCaching, disableTtsStreaming, disableTtsCache});
break;
case 'nuance':
model = model || 'enhanced';
@@ -224,7 +225,7 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
break;
case 'nvidia':
audioData = await synthNvidia(client, logger, {credentials, stats, language, voice, model, key, text,
renderForCaching, disableTtsStreaming});
renderForCaching, disableTtsStreaming, disableTtsCache});
break;
case 'ibm':
audioData = await synthIbm(logger, {credentials, stats, voice, key, text});
@@ -234,44 +235,50 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
break;
case 'elevenlabs':
audioData = await synthElevenlabs(logger, {
credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming});
credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming,
disableTtsCache});
break;
case 'playht':
audioData = await synthPlayHT(client, logger, {
credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming});
credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming,
disableTtsCache});
break;
case 'cartesia':
audioData = await synthCartesia(logger, {
credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming});
credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming,
disableTtsCache});
break;
case 'inworld':
audioData = await synthInworld(logger, {
credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming});
credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming,
disableTtsCache});
break;
case 'rimelabs':
audioData = await synthRimelabs(logger, {
credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming});
credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming,
disableTtsCache});
break;
case 'whisper':
audioData = await synthWhisper(logger, {
credentials, stats, voice, key, text, instructions, renderForCaching, disableTtsStreaming});
credentials, stats, voice, key, text, instructions, renderForCaching, disableTtsStreaming,
disableTtsCache});
break;
case 'verbio':
audioData = await synthVerbio(client, logger, {
credentials, stats, voice, key, text, renderForCaching, disableTtsStreaming});
credentials, stats, voice, key, text, renderForCaching, disableTtsStreaming, disableTtsCache});
if (audioData?.filePath) return audioData;
break;
case 'deepgram':
audioData = await synthDeepgram(logger, {credentials, stats, model, key, text,
renderForCaching, disableTtsStreaming});
renderForCaching, disableTtsStreaming, disableTtsCache});
break;
case 'resemble':
audioData = await synthResemble(logger, {
credentials, stats, voice, key, text, options, renderForCaching, disableTtsStreaming});
credentials, stats, voice, key, text, options, renderForCaching, disableTtsStreaming, disableTtsCache});
break;
case vendor.startsWith('custom') ? vendor : 'cant_match_value':
audioData = await synthCustomVendor(logger,
{credentials, stats, language, voice, key, text, renderForCaching, disableTtsStreaming});
{credentials, stats, language, voice, key, text, renderForCaching, disableTtsStreaming, disableTtsCache});
break;
default:
assert(`synthAudio: unsupported speech vendor ${vendor}`);
@@ -306,14 +313,14 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
}
const synthPolly = async(createHash, retrieveHash, logger,
{credentials, stats, language, voice, engine, key, text, renderForCaching, disableTtsStreaming}) => {
{credentials, stats, language, voice, engine, key, text, renderForCaching, disableTtsStreaming, disableTtsCache}) => {
const {region, accessKeyId, secretAccessKey, roleArn} = credentials;
if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
let params = '{';
params += `language=${language}`;
params += `,playback_id=${key}`;
params += ',write_cache_file=1';
params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
params += ',vendor=aws';
if (accessKeyId && secretAccessKey) {
if (accessKeyId) params += `,accessKeyId=${accessKeyId}`;
@@ -563,7 +570,8 @@ const synthMicrosoft = async(logger, {
key,
text,
renderForCaching,
disableTtsStreaming
disableTtsStreaming,
disableTtsCache
}) => {
try {
const {api_key: apiKey, region, use_custom_tts, custom_tts_endpoint, custom_tts_endpoint_url} = credentials;
@@ -596,7 +604,7 @@ const synthMicrosoft = async(logger, {
params += `,language=${language}`;
params += ',vendor=microsoft';
params += `,voice=${voice}`;
params += ',write_cache_file=1';
params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
if (region) params += `,region=${region}`;
if (custom_tts_endpoint) params += `,endpointId=${custom_tts_endpoint}`;
if (custom_tts_endpoint_url) params += `,endpoint=${custom_tts_endpoint_url}`;
@@ -769,7 +777,7 @@ const synthNuance = async(client, logger, {credentials, stats, voice, model, tex
};
const synthNvidia = async(client, logger, {
credentials, stats, language, voice, model, key, text, renderForCaching, disableTtsStreaming
credentials, stats, language, voice, model, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
}) => {
const {riva_server_uri} = credentials;
if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
@@ -778,7 +786,7 @@ const synthNvidia = async(client, logger, {
params += `,playback_id=${key}`;
params += `,voice=${voice}`;
params += `,language=${language}`;
params += ',write_cache_file=1';
params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
params += '}';
return {
@@ -819,7 +827,7 @@ const synthNvidia = async(client, logger, {
const synthCustomVendor = async(logger, {credentials, stats, language, voice,
text, filePath, renderForCaching, disableTtsStreaming, key}) => {
text, filePath, renderForCaching, disableTtsStreaming, key, disableTtsCache}) => {
const {vendor, auth_token, custom_tts_url} = credentials;
if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
@@ -829,7 +837,7 @@ const synthCustomVendor = async(logger, {credentials, stats, language, voice,
params += `,custom_tts_url=${custom_tts_url}`;
params += ',vendor=custom';
params += `,voice=${voice}`;
params += ',write_cache_file=1';
params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
params += '}';
return {
@@ -867,7 +875,7 @@ const synthCustomVendor = async(logger, {credentials, stats, language, voice,
};
const synthElevenlabs = async(logger, {
credentials, options, stats, voice, key, text, renderForCaching, disableTtsStreaming
credentials, options, stats, voice, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
}) => {
const {api_key, model_id, options: credOpts} = credentials;
const opts = !!options && Object.keys(options).length !== 0 ? options : JSON.parse(credOpts || '{}');
@@ -880,7 +888,7 @@ const synthElevenlabs = async(logger, {
params += ',vendor=elevenlabs';
params += `,voice=${voice}`;
params += `,model_id=${model_id}`;
params += ',write_cache_file=1';
params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
if (opts.optimize_streaming_latency !== null && opts.optimize_streaming_latency !== undefined) {
params += `,optimize_streaming_latency=${opts.optimize_streaming_latency}`;
}
@@ -933,7 +941,7 @@ const synthElevenlabs = async(logger, {
};
const synthPlayHT = async(client, logger, {
credentials, options, stats, voice, language, key, text, renderForCaching, disableTtsStreaming
credentials, options, stats, voice, language, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
}) => {
const {api_key, user_id, voice_engine, playht_tts_uri, options: credOpts} = credentials;
const opts = !!options && Object.keys(options).length !== 0 ? options : JSON.parse(credOpts || '{}');
@@ -975,7 +983,7 @@ const synthPlayHT = async(client, logger, {
params += `,voice=${voice}`;
params += `,voice_engine=${voice_engine}`;
params += `,synthesize_url=${synthesizeUrl}`;
params += ',write_cache_file=1';
params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
params += `,language=${language}`;
if (opts.quality) params += `,quality=${opts.quality}`;
if (opts.speed) params += `,speed=${opts.speed}`;
@@ -1028,7 +1036,7 @@ const synthPlayHT = async(client, logger, {
};
const synthInworld = async(logger, {
credentials, options, stats, voice, key, text, renderForCaching, disableTtsStreaming
credentials, options, stats, voice, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
}) => {
const {api_key, model_id, options: credOpts} = credentials;
const opts = !!options && Object.keys(options).length !== 0 ? options : JSON.parse(credOpts || '{}');
@@ -1041,7 +1049,7 @@ const synthInworld = async(logger, {
params += `,model_id=${model_id}`;
params += ',vendor=inworld';
params += `,voice=${voice}`;
params += ',write_cache_file=1';
params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
if (opts.temperature) params += `,temperature=${opts.temperature}`;
if (opts.audioConfig?.pitch) params += `,pitch=${opts.pitch}`;
if (opts.audioConfig?.speakingRate) params += `,speakingRate=${opts.speakingRate}`;
@@ -1093,7 +1101,7 @@ const synthInworld = async(logger, {
};
const synthRimelabs = async(logger, {
credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming
credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
}) => {
const {api_key, model_id, options: credOpts} = credentials;
const opts = !!options && Object.keys(options).length !== 0 ? options : JSON.parse(credOpts || '{}');
@@ -1107,7 +1115,7 @@ const synthRimelabs = async(logger, {
params += ',vendor=rimelabs';
params += `,language=${language}`;
params += `,voice=${voice}`;
params += ',write_cache_file=1';
params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
if (opts.speedAlpha) params += `,speed_alpha=${opts.speedAlpha}`;
if (opts.reduceLatency) params += `,reduce_latency=${opts.reduceLatency}`;
// Arcana model parameters
@@ -1151,7 +1159,7 @@ const synthRimelabs = async(logger, {
}
};
const synthVerbio = async(client, logger, {
credentials, stats, voice, key, text, renderForCaching, disableTtsStreaming
credentials, stats, voice, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
}) => {
//https://doc.speechcenter.verbio.com/#tag/Text-To-Speech-REST-API
if (text.length > 2000) {
@@ -1164,7 +1172,7 @@ const synthVerbio = async(client, logger, {
params += `,playback_id=${key}`;
params += ',vendor=verbio';
params += `,voice=${voice}`;
params += ',write_cache_file=1';
params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
params += '}';
return {
@@ -1199,7 +1207,7 @@ const synthVerbio = async(client, logger, {
};
const synthWhisper = async(logger, {credentials, stats, voice, key, text, instructions,
renderForCaching, disableTtsStreaming}) => {
renderForCaching, disableTtsStreaming, disableTtsCache}) => {
const {api_key, model_id, baseURL, timeout, speed} = credentials;
/* if the env is set to stream then bag out, unless we are specifically rendering to generate a cache file */
if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
@@ -1209,7 +1217,7 @@ const synthWhisper = async(logger, {credentials, stats, voice, key, text, instru
params += `,model_id=${model_id}`;
params += ',vendor=whisper';
params += `,voice=${voice}`;
params += ',write_cache_file=1';
params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
if (speed) params += `,speed=${speed}`;
// comma is used to separated parameters in freeswitch tts module
if (instructions) params += `,instructions=${instructions.replace(/\n/g, ' ').replace(/,/g, ';')}`;
@@ -1247,7 +1255,8 @@ const synthWhisper = async(logger, {credentials, stats, voice, key, text, instru
}
};
const synthDeepgram = async(logger, {credentials, stats, model, key, text, renderForCaching, disableTtsStreaming}) => {
const synthDeepgram = async(logger, {credentials, stats, model, key, text, renderForCaching,
disableTtsStreaming, disableTtsCache}) => {
const {api_key, deepgram_tts_uri} = credentials;
if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
let params = '{';
@@ -1255,7 +1264,7 @@ const synthDeepgram = async(logger, {credentials, stats, model, key, text, rende
params += `,playback_id=${key}`;
params += ',vendor=deepgram';
params += `,voice=${model}`;
params += ',write_cache_file=1';
params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
if (deepgram_tts_uri) params += `,endpoint=${deepgram_tts_uri}`;
params += '}';
@@ -1288,7 +1297,7 @@ const synthDeepgram = async(logger, {credentials, stats, model, key, text, rende
};
const synthCartesia = async(logger, {
credentials, options, stats, voice, language, key, text, renderForCaching, disableTtsStreaming
credentials, options, stats, voice, language, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
}) => {
const {api_key, model_id, embedding, options: credOpts} = credentials;
const opts = !!options && Object.keys(options).length !== 0 ? options : JSON.parse(credOpts || '{}');
@@ -1300,7 +1309,7 @@ const synthCartesia = async(logger, {
params += `,model_id=${model_id}`;
params += ',vendor=cartesia';
params += `,voice=${voice}`;
params += ',write_cache_file=1';
params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
params += `,language=${language}`;
params += `,voice_mode=${embedding ? 'embedding' : 'id'}`;
if (embedding) params += `,embedding=${embedding}`;
@@ -1359,7 +1368,7 @@ const synthCartesia = async(logger, {
};
const synthResemble = async(logger, {
credentials, options, stats, voice, key, text, renderForCaching, disableTtsStreaming
credentials, options, stats, voice, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
}) => {
const {api_key, resemble_tts_uri, resemble_tts_use_tls} = credentials;
const {project_uuid, use_hd} = options || {};
@@ -1371,7 +1380,7 @@ const synthResemble = async(logger, {
params += `,playback_id=${key}`;
params += ',vendor=resemble';
params += `,voice=${voice}`;
params += ',write_cache_file=1';
params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
if (project_uuid) params += `,project_uuid=${project_uuid}`;
if (use_hd) params += `,use_hd=${use_hd}`;
if (resemble_tts_uri) params += `,endpoint=${resemble_tts_uri}`;