Merge pull request #130 from jambonz/feat/disableTtsCache

Set write_cache_file=0 (instead of 1) when disableTtsCache is set
2026-07-04 19:31:49 +00:00 · 2025-10-03 02:21:57 -04:00
parent fdb56cbc77 6fecb8755d
commit eb4e1a773f
1 changed file with 48 additions and 39 deletions
@@ -210,13 +210,14 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
      case 'polly':
        vendorLabel = 'aws';
        audioData = await synthPolly(createHash, retrieveHash, logger,
-          {credentials, stats, language, voice, key, text, engine, renderForCaching, disableTtsStreaming});
+          {credentials, stats, language, voice, key, text, engine, renderForCaching, disableTtsStreaming,
+            disableTtsCache});
        break;
      case 'azure':
      case 'microsoft':
        vendorLabel = 'microsoft';
        audioData = await synthMicrosoft(logger, {credentials, stats, language, voice, key, text, deploymentId,
-          renderForCaching, disableTtsStreaming});
+          renderForCaching, disableTtsStreaming, disableTtsCache});
        break;
      case 'nuance':
        model = model || 'enhanced';
@@ -224,7 +225,7 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
        break;
      case 'nvidia':
        audioData = await synthNvidia(client, logger, {credentials, stats, language, voice, model, key, text,
-          renderForCaching, disableTtsStreaming});
+          renderForCaching, disableTtsStreaming, disableTtsCache});
        break;
      case 'ibm':
        audioData = await synthIbm(logger, {credentials, stats, voice, key, text});
@@ -234,44 +235,50 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
        break;
      case 'elevenlabs':
        audioData = await synthElevenlabs(logger, {
-          credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming});
+          credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming,
+          disableTtsCache});
        break;
      case 'playht':
        audioData = await synthPlayHT(client, logger, {
-          credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming});
+          credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming,
+          disableTtsCache});
        break;
      case 'cartesia':
        audioData = await synthCartesia(logger, {
-          credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming});
+          credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming,
+          disableTtsCache});
        break;
      case 'inworld':
        audioData = await synthInworld(logger, {
-          credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming});
+          credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming,
+          disableTtsCache});
        break;
      case 'rimelabs':
        audioData = await synthRimelabs(logger, {
-          credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming});
+          credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming,
+          disableTtsCache});
        break;
      case 'whisper':
        audioData = await synthWhisper(logger, {
-          credentials, stats, voice, key, text, instructions, renderForCaching, disableTtsStreaming});
+          credentials, stats, voice, key, text, instructions, renderForCaching, disableTtsStreaming,
+          disableTtsCache});
        break;
      case 'verbio':
        audioData = await synthVerbio(client, logger, {
-          credentials, stats, voice, key, text, renderForCaching, disableTtsStreaming});
+          credentials, stats, voice, key, text, renderForCaching, disableTtsStreaming, disableTtsCache});
        if (audioData?.filePath) return audioData;
        break;
      case 'deepgram':
        audioData = await synthDeepgram(logger, {credentials, stats, model, key, text,
-          renderForCaching, disableTtsStreaming});
+          renderForCaching, disableTtsStreaming, disableTtsCache});
        break;
      case 'resemble':
        audioData = await synthResemble(logger, {
-          credentials, stats, voice, key, text, options, renderForCaching, disableTtsStreaming});
+          credentials, stats, voice, key, text, options, renderForCaching, disableTtsStreaming, disableTtsCache});
        break;
      case vendor.startsWith('custom') ? vendor : 'cant_match_value':
        audioData = await synthCustomVendor(logger,
-          {credentials, stats, language, voice, key, text, renderForCaching, disableTtsStreaming});
+          {credentials, stats, language, voice, key, text, renderForCaching, disableTtsStreaming, disableTtsCache});
        break;
      default:
        assert(`synthAudio: unsupported speech vendor ${vendor}`);
@@ -306,14 +313,14 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
 }

 const synthPolly = async(createHash, retrieveHash, logger,
-  {credentials, stats, language, voice, engine, key, text, renderForCaching, disableTtsStreaming}) => {
+  {credentials, stats, language, voice, engine, key, text, renderForCaching, disableTtsStreaming, disableTtsCache}) => {
  const {region, accessKeyId, secretAccessKey, roleArn} = credentials;
  if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {

    let params = '{';
    params += `language=${language}`;
    params += `,playback_id=${key}`;
-    params += ',write_cache_file=1';
+    params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
    params += ',vendor=aws';
    if (accessKeyId && secretAccessKey) {
      if (accessKeyId) params += `,accessKeyId=${accessKeyId}`;
@@ -563,7 +570,8 @@ const synthMicrosoft = async(logger, {
  key,
  text,
  renderForCaching,
-  disableTtsStreaming
+  disableTtsStreaming,
+  disableTtsCache
 }) => {
  try {
    const {api_key: apiKey, region, use_custom_tts, custom_tts_endpoint, custom_tts_endpoint_url} = credentials;
@@ -596,7 +604,7 @@ const synthMicrosoft = async(logger, {
      params += `,language=${language}`;
      params += ',vendor=microsoft';
      params += `,voice=${voice}`;
-      params += ',write_cache_file=1';
+      params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
      if (region) params += `,region=${region}`;
      if (custom_tts_endpoint) params += `,endpointId=${custom_tts_endpoint}`;
      if (custom_tts_endpoint_url) params += `,endpoint=${custom_tts_endpoint_url}`;
@@ -769,7 +777,7 @@ const synthNuance = async(client, logger, {credentials, stats, voice, model, tex
 };

 const synthNvidia = async(client, logger, {
-  credentials, stats, language,  voice, model, key, text, renderForCaching, disableTtsStreaming
+  credentials, stats, language,  voice, model, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
 }) => {
  const {riva_server_uri} = credentials;
  if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
@@ -778,7 +786,7 @@ const synthNvidia = async(client, logger, {
    params += `,playback_id=${key}`;
    params += `,voice=${voice}`;
    params += `,language=${language}`;
-    params += ',write_cache_file=1';
+    params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
    params += '}';

    return {
@@ -819,7 +827,7 @@ const synthNvidia = async(client, logger, {


 const synthCustomVendor = async(logger, {credentials, stats, language, voice,
-  text, filePath, renderForCaching, disableTtsStreaming, key}) => {
+  text, filePath, renderForCaching, disableTtsStreaming, key, disableTtsCache}) => {
  const {vendor, auth_token, custom_tts_url} = credentials;

  if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
@@ -829,7 +837,7 @@ const synthCustomVendor = async(logger, {credentials, stats, language, voice,
    params += `,custom_tts_url=${custom_tts_url}`;
    params += ',vendor=custom';
    params += `,voice=${voice}`;
-    params += ',write_cache_file=1';
+    params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
    params += '}';

    return {
@@ -867,7 +875,7 @@ const synthCustomVendor = async(logger, {credentials, stats, language, voice,
 };

 const synthElevenlabs = async(logger, {
-  credentials, options, stats, voice, key, text, renderForCaching, disableTtsStreaming
+  credentials, options, stats, voice, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
 }) => {
  const {api_key, model_id, options: credOpts} = credentials;
  const opts = !!options && Object.keys(options).length !== 0 ? options : JSON.parse(credOpts || '{}');
@@ -880,7 +888,7 @@ const synthElevenlabs = async(logger, {
    params += ',vendor=elevenlabs';
    params += `,voice=${voice}`;
    params += `,model_id=${model_id}`;
-    params += ',write_cache_file=1';
+    params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
    if (opts.optimize_streaming_latency !== null && opts.optimize_streaming_latency !== undefined) {
      params += `,optimize_streaming_latency=${opts.optimize_streaming_latency}`;
    }
@@ -933,7 +941,7 @@ const synthElevenlabs = async(logger, {
 };

 const synthPlayHT = async(client, logger, {
-  credentials, options, stats, voice, language, key, text, renderForCaching, disableTtsStreaming
+  credentials, options, stats, voice, language, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
 }) => {
  const {api_key, user_id, voice_engine, playht_tts_uri, options: credOpts} = credentials;
  const opts = !!options && Object.keys(options).length !== 0 ? options : JSON.parse(credOpts || '{}');
@@ -975,7 +983,7 @@ const synthPlayHT = async(client, logger, {
    params += `,voice=${voice}`;
    params += `,voice_engine=${voice_engine}`;
    params += `,synthesize_url=${synthesizeUrl}`;
-    params += ',write_cache_file=1';
+    params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
    params += `,language=${language}`;
    if (opts.quality) params += `,quality=${opts.quality}`;
    if (opts.speed) params += `,speed=${opts.speed}`;
@@ -1028,7 +1036,7 @@ const synthPlayHT = async(client, logger, {
 };

 const synthInworld = async(logger, {
-  credentials, options, stats, voice, key, text, renderForCaching, disableTtsStreaming
+  credentials, options, stats, voice, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
 }) => {
  const {api_key, model_id, options: credOpts} = credentials;
  const opts = !!options && Object.keys(options).length !== 0 ? options : JSON.parse(credOpts || '{}');
@@ -1041,7 +1049,7 @@ const synthInworld = async(logger, {
    params += `,model_id=${model_id}`;
    params += ',vendor=inworld';
    params += `,voice=${voice}`;
-    params += ',write_cache_file=1';
+    params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
    if (opts.temperature) params += `,temperature=${opts.temperature}`;
    if (opts.audioConfig?.pitch) params += `,pitch=${opts.pitch}`;
    if (opts.audioConfig?.speakingRate) params += `,speakingRate=${opts.speakingRate}`;
@@ -1093,7 +1101,7 @@ const synthInworld = async(logger, {
 };

 const synthRimelabs = async(logger, {
-  credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming
+  credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
 }) => {
  const {api_key, model_id, options: credOpts} = credentials;
  const opts = !!options && Object.keys(options).length !== 0 ? options : JSON.parse(credOpts || '{}');
@@ -1107,7 +1115,7 @@ const synthRimelabs = async(logger, {
    params += ',vendor=rimelabs';
    params += `,language=${language}`;
    params += `,voice=${voice}`;
-    params += ',write_cache_file=1';
+    params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
    if (opts.speedAlpha) params += `,speed_alpha=${opts.speedAlpha}`;
    if (opts.reduceLatency) params += `,reduce_latency=${opts.reduceLatency}`;
    // Arcana model parameters
@@ -1151,7 +1159,7 @@ const synthRimelabs = async(logger, {
  }
 };
 const synthVerbio = async(client, logger, {
-  credentials, stats, voice, key, text, renderForCaching, disableTtsStreaming
+  credentials, stats, voice, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
 }) => {
  //https://doc.speechcenter.verbio.com/#tag/Text-To-Speech-REST-API
  if (text.length > 2000) {
@@ -1164,7 +1172,7 @@ const synthVerbio = async(client, logger, {
    params += `,playback_id=${key}`;
    params += ',vendor=verbio';
    params += `,voice=${voice}`;
-    params += ',write_cache_file=1';
+    params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
    params += '}';

    return {
@@ -1199,7 +1207,7 @@ const synthVerbio = async(client, logger, {
 };

 const synthWhisper = async(logger, {credentials, stats, voice, key, text, instructions,
-  renderForCaching, disableTtsStreaming}) => {
+  renderForCaching, disableTtsStreaming, disableTtsCache}) => {
  const {api_key, model_id, baseURL, timeout, speed} = credentials;
  /* if the env is set to stream then bag out, unless we are specifically rendering to generate a cache file */
  if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
@@ -1209,7 +1217,7 @@ const synthWhisper = async(logger, {credentials, stats, voice, key, text, instru
    params += `,model_id=${model_id}`;
    params += ',vendor=whisper';
    params += `,voice=${voice}`;
-    params += ',write_cache_file=1';
+    params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
    if (speed) params += `,speed=${speed}`;
    // comma is used to separated parameters in freeswitch tts module
    if (instructions) params += `,instructions=${instructions.replace(/\n/g, ' ').replace(/,/g, ';')}`;
@@ -1247,7 +1255,8 @@ const synthWhisper = async(logger, {credentials, stats, voice, key, text, instru
  }
 };

-const synthDeepgram = async(logger, {credentials, stats, model, key, text, renderForCaching, disableTtsStreaming}) => {
+const synthDeepgram = async(logger, {credentials, stats, model, key, text, renderForCaching,
+  disableTtsStreaming, disableTtsCache}) => {
  const {api_key, deepgram_tts_uri} = credentials;
  if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
    let params = '{';
@@ -1255,7 +1264,7 @@ const synthDeepgram = async(logger, {credentials, stats, model, key, text, rende
    params += `,playback_id=${key}`;
    params += ',vendor=deepgram';
    params += `,voice=${model}`;
-    params += ',write_cache_file=1';
+    params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
    if (deepgram_tts_uri) params += `,endpoint=${deepgram_tts_uri}`;
    params += '}';

@@ -1288,7 +1297,7 @@ const synthDeepgram = async(logger, {credentials, stats, model, key, text, rende
 };

 const synthCartesia = async(logger, {
-  credentials, options, stats, voice, language, key, text, renderForCaching, disableTtsStreaming
+  credentials, options, stats, voice, language, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
 }) => {
  const {api_key, model_id, embedding, options: credOpts} = credentials;
  const opts = !!options && Object.keys(options).length !== 0 ? options : JSON.parse(credOpts || '{}');
@@ -1300,7 +1309,7 @@ const synthCartesia = async(logger, {
    params += `,model_id=${model_id}`;
    params += ',vendor=cartesia';
    params += `,voice=${voice}`;
-    params += ',write_cache_file=1';
+    params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
    params += `,language=${language}`;
    params += `,voice_mode=${embedding ? 'embedding' : 'id'}`;
    if (embedding) params += `,embedding=${embedding}`;
@@ -1359,7 +1368,7 @@ const synthCartesia = async(logger, {
 };

 const synthResemble = async(logger, {
-  credentials, options, stats, voice, key, text, renderForCaching, disableTtsStreaming
+  credentials, options, stats, voice, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
 }) => {
  const {api_key, resemble_tts_uri, resemble_tts_use_tls} = credentials;
  const {project_uuid, use_hd} = options || {};
@@ -1371,7 +1380,7 @@ const synthResemble = async(logger, {
    params += `,playback_id=${key}`;
    params += ',vendor=resemble';
    params += `,voice=${voice}`;
-    params += ',write_cache_file=1';
+    params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
    if (project_uuid) params += `,project_uuid=${project_uuid}`;
    if (use_hd) params += `,use_hd=${use_hd}`;
    if (resemble_tts_uri) params += `,endpoint=${resemble_tts_uri}`;