remove audio extension from audio key

2025-12-19 03:37:49 +00:00 · 2024-12-18 16:46:24 +07:00
parent e5b5e3d0c6
commit 7327190471
4 changed files with 196 additions and 194 deletions
--- a/lib/add-file-to-cache.js
+++ b/lib/add-file-to-cache.js
@@ -3,6 +3,27 @@ const {noopLogger, makeSynthKey} = require('./utils');
 const {JAMBONES_TTS_CACHE_DURATION_MINS} = require('./config');
 const EXPIRES = JAMBONES_TTS_CACHE_DURATION_MINS;

+function getExtensionAndSampleRate(path) {
+  const match = path.match(/\.([^.]*)$/);
+  if (!match) {
+    // No extension found: default to a wav file at 8000 Hz.
+    return ['wav', 8000];
+  }
+
+  const extension = match[1];
+  const sampleRateMap = {
+    r8: 8000,
+    r16: 16000,
+    r24: 24000,
+    r44: 44100,
+    r48: 48000,
+    r96: 96000,
+  };
+
+  const sampleRate = sampleRateMap[extension] || 8000;
+  return [extension, sampleRate];
+}
+
 async function addFileToCache(client, logger, path,
  {account_sid, vendor, language, voice, deploymentId, engine, text}) {
  let key;
@@ -17,8 +38,15 @@ async function addFileToCache(client, logger, path,
      engine,
      text,
    });
+    const [extension, sampleRate] = getExtensionAndSampleRate(path);
    const audioBuffer = await fs.readFile(path);
-    await client.setex(key, EXPIRES, audioBuffer.toString('base64'));
+    await client.setex(key, EXPIRES, JSON.stringify(
+      {
+        audioContent: audioBuffer.toString('base64'),
+        extension,
+        sampleRate
+      }
+    ));
  } catch (err) {
    logger.error(err, 'addFileToCache: Error');
    return;
--- a/lib/synth-audio.js
+++ b/lib/synth-audio.js
@@ -46,7 +46,7 @@ const {
  JAMBONES_HTTP_PROXY_IP,
  JAMBONES_HTTP_PROXY_PORT,
  JAMBONES_TTS_CACHE_DURATION_MINS,
-  JAMBONES_EAGERLY_PRE_CACHE_AUDIO,
+  JAMBONES_TTS_TRIM_SILENCE,
 } = require('./config');
 const EXPIRES = JAMBONES_TTS_CACHE_DURATION_MINS;
 const OpenAI = require('openai');
@@ -91,7 +91,7 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
  vendor, language, voice, gender, text, engine, salt, model, credentials, deploymentId,
  disableTtsCache, renderForCaching = false, disableTtsStreaming, options
 }) {
-  let audioBuffer;
+  let audioData;
  let servedFromCache = false;
  let rtt;
  logger = logger || noopLogger;
@@ -172,54 +172,22 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
    text,
    renderForCaching
  });
-  let filePath;
-  // used only for custom vendor
-  let fileExtension;
-  filePath = makeFilePath({vendor, voice, key, salt, renderForCaching});
+
  debug(`synth key is ${key}`);
  let cached;
  if (!disableTtsCache) {
    cached = await client.get(key);
-    /**
-    *  If we are using tts streaming and also precaching audio, audio could have been cached by streaming (r8)
-    * or here in speech-utils due to precaching (mp3), so we need to check for both keys.
-    */
-    if (!cached && JAMBONES_EAGERLY_PRE_CACHE_AUDIO) {
-      const preCachekey = makeSynthKey({
-        account_sid,
-        vendor,
-        language: language || '',
-        voice: voice || deploymentId,
-        engine,
-        text,
-        renderForCaching: true
-      });
-      cached = await client.get(preCachekey);
-      if (cached) {
-        // Precache audio is available update filpath with precache file extension.
-        filePath = makeFilePath({vendor, voice, key, salt, renderForCaching: true});
-      }
-    }
  }
  if (cached) {
    // found in cache - extend the expiry and use it
    debug('result WAS found in cache');
    servedFromCache = true;
    stats.increment('tts.cache.requests', ['found:yes']);
-    if (vendor.startsWith('custom')) {
-      // custom vendors support multiple mime types such as: mp3, wav, r8, r16 ...etc,
-      // mime type/file extension is available when http response has header Content-type.
-      // In cache, file extension is store together with audiBuffer in a json.
-      // Normal cache audio will be base64 string
-      const payload = JSON.parse(cached);
-      filePath = filePath.replace(/\.[^\.]*$/g, payload.fileExtension);
-      audioBuffer = Buffer.from(payload.audioBuffer, 'base64');
-    } else {
-      audioBuffer = Buffer.from(cached, 'base64');
-    }
+    audioData = JSON.parse(cached);
+    // convert base64 audio to buffer
+    audioData.audioContent = Buffer.from(audioData.audioContent, 'base64');
    client.expire(key, EXPIRES).catch((err) => logger.info(err, 'Error setting expires'));
-  }
-  if (!cached) {
+  } else {
    // not found in cache - go get it from speech vendor and add to cache
    debug('result was NOT found in cache');
    stats.increment('tts.cache.requests', ['found:no']);
@@ -227,94 +195,92 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
    const startAt = process.hrtime();
    switch (vendor) {
      case 'google':
-        audioBuffer = await synthGoogle(logger, {credentials, stats, language, voice, gender, text});
+        audioData = await synthGoogle(logger, {credentials, stats, language, voice, gender, text});
        break;
      case 'aws':
      case 'polly':
        vendorLabel = 'aws';
-        audioBuffer = await synthPolly(createHash, retrieveHash, logger,
+        audioData = await synthPolly(createHash, retrieveHash, logger,
          {credentials, stats, language, voice, text, engine});
        break;
      case 'azure':
      case 'microsoft':
        vendorLabel = 'microsoft';
-        audioBuffer = await synthMicrosoft(logger, {credentials, stats, language, voice, text, deploymentId,
-          filePath, renderForCaching, disableTtsStreaming});
+        audioData = await synthMicrosoft(logger, {credentials, stats, language, voice, text, deploymentId,
+          renderForCaching, disableTtsStreaming});
        break;
      case 'nuance':
        model = model || 'enhanced';
-        audioBuffer = await synthNuance(client, logger, {credentials, stats, voice, model, text});
+        audioData = await synthNuance(client, logger, {credentials, stats, voice, model, text});
        break;
      case 'nvidia':
-        audioBuffer = await synthNvidia(client, logger, {credentials, stats, language, voice, model, text});
+        audioData = await synthNvidia(client, logger, {credentials, stats, language, voice, model, text});
        break;
      case 'ibm':
-        audioBuffer = await synthIbm(logger, {credentials, stats, voice, text});
+        audioData = await synthIbm(logger, {credentials, stats, voice, text});
        break;
      case 'wellsaid':
-        audioBuffer = await synthWellSaid(logger, {credentials, stats, language, voice, text, filePath});
+        audioData = await synthWellSaid(logger, {credentials, stats, language, voice, text});
        break;
      case 'elevenlabs':
-        audioBuffer = await synthElevenlabs(logger, {
-          credentials, options, stats, language, voice, text, renderForCaching, disableTtsStreaming, filePath
-        });
+        audioData = await synthElevenlabs(logger, {
+          credentials, options, stats, language, voice, text, renderForCaching, disableTtsStreaming});
        break;
      case 'playht':
-        audioBuffer = await synthPlayHT(client, logger, {
-          credentials, options, stats, language, voice, text, renderForCaching, disableTtsStreaming, filePath
-        });
+        audioData = await synthPlayHT(client, logger, {
+          credentials, options, stats, language, voice, text, renderForCaching, disableTtsStreaming});
        break;
      case 'cartesia':
-        audioBuffer = await synthCartesia(logger, {
-          credentials, options, stats, language, voice, text, renderForCaching, disableTtsStreaming, filePath
-        });
+        audioData = await synthCartesia(logger, {
+          credentials, options, stats, language, voice, text, renderForCaching, disableTtsStreaming});
        break;
      case 'rimelabs':
-        audioBuffer = await synthRimelabs(logger, {
-          credentials, options, stats, language, voice, text, renderForCaching, disableTtsStreaming, filePath
-        });
+        audioData = await synthRimelabs(logger, {
+          credentials, options, stats, language, voice, text, renderForCaching, disableTtsStreaming});
        break;
      case 'whisper':
-        audioBuffer = await synthWhisper(logger, {
+        audioData = await synthWhisper(logger, {
          credentials, stats, voice, text, renderForCaching, disableTtsStreaming});
        break;
      case 'verbio':
-        audioBuffer = await synthVerbio(client, logger, {
+        audioData = await synthVerbio(client, logger, {
          credentials, stats, voice, text, renderForCaching, disableTtsStreaming});
-        if (audioBuffer?.filePath) return audioBuffer;
+        if (audioData?.filePath) return audioData;
        break;
      case 'deepgram':
-        audioBuffer = await synthDeepgram(logger, {credentials, stats, model, text,
+        audioData = await synthDeepgram(logger, {credentials, stats, model, text,
          renderForCaching, disableTtsStreaming});
        break;
      case vendor.startsWith('custom') ? vendor : 'cant_match_value':
-        ({ audioBuffer, filePath, fileExtension } = await synthCustomVendor(logger,
-          {credentials, stats, language, voice, text, filePath}));
+        audioData = await synthCustomVendor(logger,
+          {credentials, stats, language, voice, text});
        break;
      default:
        assert(`synthAudio: unsupported speech vendor ${vendor}`);
    }
-    if ('filePath' in audioBuffer) return audioBuffer;
+    if ('filePath' in audioData) return audioData;
    const diff = process.hrtime(startAt);
    const time = diff[0] * 1e3 + diff[1] * 1e-6;
    rtt = time.toFixed(0);
    stats.histogram('tts.response_time', rtt, [`vendor:${vendorLabel}`]);
    debug(`tts rtt time for ${text.length} chars on ${vendorLabel}: ${rtt}`);
    logger.info(`tts rtt time for ${text.length} chars on ${vendorLabel}: ${rtt}`);
-
-    const base64Audio = audioBuffer.toString('base64');
-    const cacheContent = vendor.startsWith('custom') ?
-      JSON.stringify({
-        audioBuffer: base64Audio,
-        fileExtension
-      }) : base64Audio;
-
-    client.setex(key, EXPIRES, cacheContent)
+    // Cache the audio as JSON, with audioContent base64-encoded.
+    client.setex(key, EXPIRES, JSON.stringify({
+      ...audioData,
+      audioContent: audioData.audioContent?.toString('base64')
+    }))
      .catch((err) => logger.error(err, `error calling setex on key ${key}`));
  }

  return new Promise((resolve, reject) => {
-    fs.writeFile(filePath, audioBuffer, (err) => {
+    const { audioContent, extension } = audioData;
+    const filePath = makeFilePath({
+      key,
+      salt,
+      extension
+    });
+    fs.writeFile(filePath, audioContent, (err) => {
      if (err) return reject(err);
      resolve({filePath, servedFromCache, rtt});
    });
@@ -369,7 +335,13 @@ const synthPolly = async(createHash, retrieveHash, logger,
        .on('data', (chunk) => {
          chunks.push(chunk);
        })
-        .on('end', () => resolve(Buffer.concat(chunks)));
+        .on('end', () => resolve(
+          {
+            audioContent: Buffer.concat(chunks),
+            extension: 'mp3',
+            sampleRate: 8000
+          }
+        ));
    });
  } catch (err) {
    logger.info({err}, 'synthAudio: Error synthesizing speech using aws polly');
@@ -411,7 +383,11 @@ const synthGoogle = async(logger, {credentials, stats, language, voice, gender,
      };

      const wav = await post('/v1beta1/text:synthesize', payload);
-      return Buffer.from(wav.audioContent, 'base64');
+      return {
+        audioContent: Buffer.from(wav.audioContent, 'base64'),
+        extension: 'wav',
+        sampleRate: 24000
+      };
    } catch (err) {
      logger.info({err: await err.text()}, 'synthGoogle returned error');
      throw err;
@@ -432,7 +408,11 @@ const synthGoogle = async(logger, {credentials, stats, language, voice, gender,
    const responses = await client.synthesizeSpeech(opts);
    stats.increment('tts.count', ['vendor:google', 'accepted:yes']);
    client.close();
-    return responses[0].audioContent;
+    return {
+      audioContent: responses[0].audioContent,
+      extension: 'mp3',
+      sampleRate: 8000
+    };
  } catch (err) {
    console.error(err);
    logger.info({err, opts}, 'synthAudio: Error synthesizing speech using google');
@@ -463,7 +443,11 @@ const synthIbm = async(logger, {credentials, stats, voice, text}) => {
    for await (const chunk of r.result) {
      chunks.push(chunk);
    }
-    return Buffer.concat(chunks);
+    return {
+      audioContent: Buffer.concat(chunks),
+      extension: 'mp3',
+      sampleRate: 8000
+    };
  } catch (err) {
    logger.info({err, params}, 'synthAudio: Error synthesizing speech using ibm');
    stats.increment('tts.count', ['vendor:ibm', 'accepted:no']);
@@ -473,11 +457,9 @@ const synthIbm = async(logger, {credentials, stats, voice, text}) => {

 async function _synthOnPremMicrosoft(logger, {
  credentials,
-  stats,
  language,
  voice,
-  text,
-  filePath
+  text
 }) {
  const {use_custom_tts, custom_tts_endpoint_url, api_key} = credentials;
  let content = text;
@@ -499,15 +481,19 @@ async function _synthOnPremMicrosoft(logger, {
  }

  try {
-    const trimSilence = filePath.endsWith('.r8');
+    const trimSilence = JAMBONES_TTS_TRIM_SILENCE;
    const post = bent('POST', 'buffer', {
      'X-Microsoft-OutputFormat': trimSilence ? 'raw-8khz-16bit-mono-pcm' : 'audio-16khz-32kbitrate-mono-mp3',
      'Content-Type': 'application/ssml+xml',
      'User-Agent': 'Jambonz',
      ...(api_key && {'Ocp-Apim-Subscription-Key': api_key})
    });
-    const mp3 = await post(custom_tts_endpoint_url, content);
-    return mp3;
+    const audioContent = await post(custom_tts_endpoint_url, content);
+    return {
+      audioContent,
+      extension: trimSilence ? 'r8' : 'mp3',
+      sampleRate: 8000
+    };
  } catch (err) {
    logger.info({err}, '_synthMicrosoftByHttp returned error');
    throw err;
@@ -520,7 +506,6 @@ const synthMicrosoft = async(logger, {
  language,
  voice,
  text,
-  filePath,
  renderForCaching,
  disableTtsStreaming
 }) => {
@@ -563,17 +548,18 @@ const synthMicrosoft = async(logger, {
        rtt: 0
      };
    }
+    // Azure on-prem: synthesize via the custom TTS endpoint
    if (use_custom_tts && custom_tts_endpoint_url) {
      return await _synthOnPremMicrosoft(logger, {
        credentials,
        stats,
        language,
        voice,
-        text,
-        filePath
+        text
      });
    }
-    const trimSilence = filePath.endsWith('.r8');
+    // Azure hosted service
+    const trimSilence = JAMBONES_TTS_TRIM_SILENCE;
    const speechConfig = SpeechConfig.fromSubscription(apiKey, region);
    speechConfig.speechSynthesisLanguage = language;
    speechConfig.speechSynthesisVoiceName = voice;
@@ -608,7 +594,11 @@ const synthMicrosoft = async(logger, {
            case ResultReason.SynthesizingAudioCompleted:
              let buffer = Buffer.from(result.audioData);
              if (trimSilence) buffer = trimTrailingSilence(buffer);
-              resolve(buffer);
+              resolve({
+                audioContent: buffer,
+                extension: trimSilence ? 'r8' : 'mp3',
+                sampleRate: 8000
+              });
              synthesizer.close();
              stats.increment('tts.count', ['vendor:microsoft', 'accepted:yes']);
              break;
@@ -638,11 +628,15 @@ const synthWellSaid = async(logger, {credentials, stats, language, voice, gender
      'Accept': 'audio/mpeg',
      'Content-Type': 'application/json'
    });
-    const mp3 = await post('/v1/tts/stream', {
+    const audioContent = await post('/v1/tts/stream', {
      text,
      speaker_id: voice
    });
-    return mp3;
+    return {
+      audioContent,
+      extension: 'mp3',
+      sampleRate: 8000
+    };
  } catch (err) {
    logger.info({err}, 'testWellSaidTts returned error');
    throw err;
@@ -679,8 +673,8 @@ const synthNuance = async(client, logger, {credentials, stats, voice, model, tex
    t.setText(text);
    input.setText(t);
  }
-
-  pcm.setSampleRateHz(8000);
+  const sampleRate = 8000;
+  pcm.setSampleRateHz(sampleRate);
  f.setPcm(pcm);
  p.setAudioFormat(f);
  v.setName(voice);
@@ -704,7 +698,11 @@ const synthNuance = async(client, logger, {credentials, stats, voice, model, tex
        const details = status.getDetails();
        return reject({code, message, details});
      }
-      resolve(Buffer.from(response.getAudio()));
+      resolve({
+        audioContent: Buffer.from(response.getAudio()),
+        extension: 'r8',
+        sampleRate
+      });
    });
  });
 };
@@ -712,12 +710,13 @@ const synthNuance = async(client, logger, {credentials, stats, voice, model, tex
 const synthNvidia = async(client, logger, {credentials, stats, language,  voice, model, text}) => {
  const {riva_server_uri} = credentials;
  let rivaClient, request;
+  const sampleRate = 8000;
  try {
    rivaClient = await createRivaClient(riva_server_uri);
    request = new SynthesizeSpeechRequest();
    request.setVoiceName(voice);
    request.setLanguageCode(language);
-    request.setSampleRateHz(8000);
+    request.setSampleRateHz(sampleRate);
    request.setEncoding(AudioEncoding.LINEAR_PCM);
    request.setText(text);
  } catch (err) {
@@ -731,7 +730,11 @@ const synthNvidia = async(client, logger, {credentials, stats, language,  voice,
        logger.info({err, voice, language}, 'error synthesizing speech using Nvidia');
        return reject(err);
      }
-      resolve(Buffer.from(response.getAudio()));
+      resolve({
+        audioContent: Buffer.from(response.getAudio()),
+        extension: 'r8',
+        sampleRate
+      });
    });
  });
 };
@@ -753,14 +756,13 @@ const synthCustomVendor = async(logger, {credentials, stats, language, voice, te
      text
    });

-    const regex = /\.[^\.]*$/g;
    const mime = response.headers['content-type'];
    const buffer = await response.arrayBuffer();
-    const fileExtension = getFileExtFromMime(mime);
+    const [extension, sampleRate] = getFileExtFromMime(mime);
    return {
-      audioBuffer: buffer,
-      filePath: filePath.replace(regex, fileExtension),
-      fileExtension
+      audioContent: buffer,
+      extension,
+      sampleRate
    };
  } catch (err) {
    logger.info({err}, `Vendor ${vendor} returned error`);
@@ -806,7 +808,7 @@ const synthElevenlabs = async(logger, {
      'Accept': 'audio/mpeg',
      'Content-Type': 'application/json'
    });
-    const mp3 = await post(`/v1/text-to-speech/${voice}${optimize_streaming_latency}`, {
+    const audioContent = await post(`/v1/text-to-speech/${voice}${optimize_streaming_latency}`, {
      text,
      model_id,
      voice_settings: {
@@ -815,7 +817,11 @@ const synthElevenlabs = async(logger, {
      },
      ...opts
    });
-    return mp3;
+    return {
+      audioContent,
+      extension: 'mp3',
+      sampleRate: 8000
+    };
  } catch (err) {
    logger.info({err}, 'synth Elevenlabs returned error');
    stats.increment('tts.count', ['vendor:elevenlabs', 'accepted:no']);
@@ -897,7 +903,7 @@ const synthPlayHT = async(client, logger, {
      'Content-Type': 'application/json'
    });

-    const mp3 = await post(synthesizeUrl, {
+    const audioContent = await post(synthesizeUrl, {
      text,
      ...(voice_engine === 'Play3.0' && { language }),
      voice,
@@ -906,7 +912,11 @@ const synthPlayHT = async(client, logger, {
      sample_rate: 8000,
      ...opts
    });
-    return mp3;
+    return {
+      audioContent,
+      extension: 'mp3',
+      sampleRate: 8000
+    };
  } catch (err) {
    logger.info({err}, 'synth PlayHT returned error');
    stats.increment('tts.count', ['vendor:playht', 'accepted:no']);
@@ -945,14 +955,19 @@ const synthRimelabs = async(logger, {
      'Accept': 'audio/mp3',
      'Content-Type': 'application/json'
    });
-    const mp3 = await post('/v1/rime-tts', {
+    const sampleRate = 8000;
+    const audioContent = await post('/v1/rime-tts', {
      speaker: voice,
      text,
      modelId: model_id,
-      samplingRate: 8000,
+      samplingRate: sampleRate,
      ...opts
    });
-    return mp3;
+    return {
+      audioContent,
+      extension: 'mp3',
+      sampleRate
+    };
  } catch (err) {
    logger.info({err}, 'synth rimelabs returned error');
    stats.increment('tts.count', ['vendor:rimelabs', 'accepted:no']);
@@ -986,13 +1001,17 @@ const synthVerbio = async(client, logger, {credentials, stats, voice, text, rend
      'User-Agent': 'jambonz',
      'Content-Type': 'application/json'
    });
-    const r8 = await post('/api/v1/synthesize', {
+    const audioContent = await post('/api/v1/synthesize', {
      voice_id: voice,
      output_sample_rate: '8k',
      output_encoding: 'pcm16',
      text
    });
-    return r8;
+    return {
+      audioContent,
+      extension: 'r8',
+      sampleRate: 8000
+    };
  } catch (err) {
    logger.info({err}, 'synth Verbio returned error');
    stats.increment('tts.count', ['vendor:verbio', 'accepted:no']);
@@ -1032,7 +1051,11 @@ const synthWhisper = async(logger, {credentials, stats, voice, text, renderForCa
      input: text,
      response_format: 'mp3'
    });
-    return Buffer.from(await mp3.arrayBuffer());
+    return {
+      audioContent: Buffer.from(await mp3.arrayBuffer()),
+      extension: 'mp3',
+      sampleRate: 8000
+    };
  } catch (err) {
    logger.info({err}, 'synth whisper returned error');
    stats.increment('tts.count', ['vendor:openai', 'accepted:no']);
@@ -1064,10 +1087,14 @@ const synthDeepgram = async(logger, {credentials, stats, model, text, renderForC
      'Accept': 'audio/mpeg',
      'Content-Type': 'application/json'
    });
-    const mp3 = await post(`/v1/speak?model=${model}`, {
+    const audioContent = await post(`/v1/speak?model=${model}`, {
      text
    });
-    return mp3;
+    return {
+      audioContent,
+      extension: 'mp3',
+      sampleRate: 8000
+    };
  } catch (err) {
    logger.info({err}, 'synth Deepgram returned error');
    stats.increment('tts.count', ['vendor:deepgram', 'accepted:no']);
@@ -1104,6 +1131,7 @@ const synthCartesia = async(logger, {

  try {
    const client = new CartesiaClient({ apiKey: api_key });
+    const sampleRate = 48000;
    const mp3 = await client.tts.bytes({
      modelId: model_id,
      transcript: text,
@@ -1119,7 +1147,7 @@ const synthCartesia = async(logger, {
        ),
        ...(opts.speed || opts.emotion && {
          experimentalControls: {
-            ...(opts.speed && {speed: opts.speed}),
+            ...(opts.speed !== null && opts.speed !== undefined && {speed: opts.speed}),
            ...(opts.emotion && {emotion: opts.emotion}),
          }
        })
@@ -1128,10 +1156,14 @@ const synthCartesia = async(logger, {
      outputFormat: {
        container: 'mp3',
        bitRate: 128000,
-        sampleRate: 8000
+        sampleRate
      },
    });
-    return Buffer.from(mp3);
+    return {
+      audioContent: Buffer.from(mp3),
+      extension: 'mp3',
+      sampleRate
+    };
  } catch (err) {
    logger.info({err}, 'synth Cartesia returned error');
    stats.increment('tts.count', ['vendor:cartesia', 'accepted:no']);
@@ -1144,22 +1176,22 @@ const getFileExtFromMime = (mime) => {
  switch (mime) {
    case 'audio/wav':
    case 'audio/x-wav':
-      return '.wav';
+      return ['wav', 8000];
    case /audio\/l16.*rate=8000/.test(mime) ? mime : 'cant match value':
-      return '.r8';
+      return ['r8', 8000];
    case /audio\/l16.*rate=16000/.test(mime) ? mime : 'cant match value':
-      return '.r16';
+      return ['r16', 16000];
    case /audio\/l16.*rate=24000/.test(mime) ? mime : 'cant match value':
-      return '.r24';
+      return ['r24', 24000];
    case /audio\/l16.*rate=32000/.test(mime) ? mime : 'cant match value':
-      return '.r32';
+      return ['r32', 32000];
    case /audio\/l16.*rate=48000/.test(mime) ? mime : 'cant match value':
-      return '.r48';
+      return ['r48', 48000];
    case 'audio/mpeg':
    case 'audio/mp3':
-      return '.mp3';
+      return ['mp3', 8000];
    default:
-      return '.wav';
+      return ['wav', 8000];
  }
 };

--- a/lib/utils.js
+++ b/lib/utils.js
@@ -6,7 +6,7 @@ const pool = new Pool('https://auth.crt.nuance.com');
 const NUANCE_AUTH_ENDPOINT = 'tts.api.nuance.com:443';
 const grpc = require('@grpc/grpc-js');
 const formurlencoded = require('form-urlencoded');
-const { JAMBONES_DISABLE_TTS_STREAMING, JAMBONES_TTS_TRIM_SILENCE, TMP_FOLDER, HTTP_TIMEOUT } = require('./config');
+const { TMP_FOLDER, HTTP_TIMEOUT } = require('./config');

 const debug = require('debug')('jambonz:realtimedb-helpers');
 /**
@@ -17,68 +17,19 @@ const debug = require('debug')('jambonz:realtimedb-helpers');
 //const nuanceClientMap = new Map();

 function makeSynthKey({
-  account_sid = '', vendor, language, voice, engine = '', text,
-  renderForCaching = false}) {
+  account_sid = '', vendor, language, voice, engine = '', text}) {
  const hash = crypto.createHash('sha1');
  hash.update(`${language}:${vendor}:${voice}:${engine}:${text}`);
  const hexHashKey = hash.digest('hex');
  const accountKey = account_sid ? `:${account_sid}` : '';
-  const namespace = vendor.startsWith('custom') ? vendor : getFileExtension({vendor, voice, renderForCaching});
-  const key = `tts${accountKey}:${namespace}:${hexHashKey}`;
+  const key = `tts${accountKey}:${hexHashKey}`;
  return key;
 }

-function makeFilePath({vendor, voice, key, salt = '', renderForCaching = false}) {
-  const extension = getFileExtension({vendor, renderForCaching, voice});
+function makeFilePath({key, salt = '', extension}) {
  return `${TMP_FOLDER}/${key.replace('tts:', `tts-${salt}`)}.${extension}`;
 }

-function getFileExtension({vendor, voice, renderForCaching = false}) {
-  const mp3Extension = 'mp3';
-  const r8Extension = 'r8';
-  const wavExtension = 'wav';
-
-  switch (vendor) {
-    case 'azure':
-    case 'microsoft':
-      if (!renderForCaching && !JAMBONES_DISABLE_TTS_STREAMING || JAMBONES_TTS_TRIM_SILENCE) {
-        return r8Extension;
-      } else {
-        return mp3Extension;
-      }
-    case 'deepgram':
-    case 'elevenlabs':
-    case 'rimelabs':
-    case 'playht':
-    case 'cartesia':
-      if (renderForCaching || JAMBONES_DISABLE_TTS_STREAMING) {
-        return mp3Extension;
-      } else {
-        return r8Extension;
-      }
-    case 'nuance':
-    case 'nvidia':
-    case 'verbio':
-      return r8Extension;
-    case 'google':
-      // google voice cloning just support wav.
-      if (typeof voice === 'object' && voice.voice_cloning_key) {
-        return wavExtension;
-      } else {
-        return mp3Extension;
-      }
-    default:
-      // If vendor is custom
-      if (vendor.startsWith('custom')) {
-        if (renderForCaching || JAMBONES_DISABLE_TTS_STREAMING) {
-          return mp3Extension;
-        } else {
-          return r8Extension;
-        }
-      }
-      return mp3Extension;
-  }
-}

 const noopLogger = {
  info: () => {},
--- a/test/synth.js
+++ b/test/synth.js
@@ -888,24 +888,15 @@ test('TTS Cache tests', async(t) => {
      language: 'non-existing',
      voice: 'non-existing',
    });
-    t.ok(purgedCountWhenErrored === 0, `purged no records when specified key was not found`);
-    t.ok(error, `error returned when specified key was not found`);
+    t.ok(purgedCountWhenErrored === 0, 'purged no records when specified key was not found');
+    t.ok(error, 'error returned when specified key was not found');

    // make sure other tts keys are still there
-    const cached = await client.keys('tts:*')
-    t.ok(cached.length >= 1, `successfully kept all non-specified tts records in cache`);
+    const cached = await client.keys('tts:*');
+    t.ok(cached.length >= 1, 'successfully kept all non-specified tts records in cache');

-    // retrieve keys from cache and check the key contains the file extension    
-    let key = cached[0];
-    t.ok(key.includes('mp3'), `tts cache extension shoult be part of the key and equal mp3`);
-
-    process.env.VG_TRIM_TTS_SILENCE = 'true';    
+    process.env.VG_TRIM_TTS_SILENCE = 'true';
    await client.set(makeSynthKey({ vendor: 'azure' }), 'value');
-
-    const r8Keys = await client.keys('tts:r8*');
-    key = r8Keys[0];
-    t.ok(key.includes('r8'), `tts cache extension shoult be part of the key and equal r8`);
-
  } catch (err) {
    console.error(JSON.stringify(err));
    t.end(err);