Merge pull request #73 from Catharsis68/feat/tts-cache-improvement

Improve handling of TTS cache by adding the file extension to the cache key
This commit is contained in:
Dave Horton
2024-05-28 12:54:31 -04:00
committed by GitHub
15 changed files with 120 additions and 61 deletions

2
.gitignore vendored
View File

@@ -39,3 +39,5 @@ node_modules
examples/*
.vscode
.env

View File

@@ -1,6 +1,7 @@
const fs = require('fs/promises');
const {noopLogger, makeSynthKey} = require('./utils');
const EXPIRES = (process.env.JAMBONES_TTS_CACHE_DURATION_MINS || 4 * 60) * 60; // cache tts for 4 hours
const {JAMBONES_TTS_CACHE_DURATION_MINS} = require('./config');
const EXPIRES = JAMBONES_TTS_CACHE_DURATION_MINS;
async function addFileToCache(client, logger, path,
{account_sid, vendor, language, voice, deploymentId, engine, text}) {

20
lib/config.js Normal file
View File

@@ -0,0 +1,20 @@
const JAMBONES_TTS_TRIM_SILENCE = process.env.JAMBONES_TTS_TRIM_SILENCE;
const JAMBONES_DISABLE_TTS_STREAMING = process.env.JAMBONES_DISABLE_TTS_STREAMING;
const JAMBONES_HTTP_PROXY_IP = process.env.JAMBONES_HTTP_PROXY_IP;
const JAMBONES_HTTP_PROXY_PORT = process.env.JAMBONES_HTTP_PROXY_PORT;
const JAMBONES_TTS_CACHE_DURATION_MINS = (parseInt(process.env.JAMBONES_TTS_CACHE_DURATION_MINS) || 4 * 60) * 60; // cache tts for 4 hours
const TMP_FOLDER = '/tmp';
const HTTP_TIMEOUT = 5000;
module.exports = {
JAMBONES_TTS_TRIM_SILENCE,
JAMBONES_DISABLE_TTS_STREAMING,
JAMBONES_HTTP_PROXY_IP,
JAMBONES_HTTP_PROXY_PORT,
JAMBONES_TTS_CACHE_DURATION_MINS,
TMP_FOLDER,
HTTP_TIMEOUT
};

View File

@@ -1,3 +0,0 @@
module.exports = {
HTTP_TIMEOUT: 5000
};

View File

@@ -2,7 +2,7 @@ const formurlencoded = require('form-urlencoded');
const {Pool} = require('undici');
const pool = new Pool('https://iam.cloud.ibm.com');
const {makeIbmKey, noopLogger} = require('./utils');
const { HTTP_TIMEOUT } = require('./constants');
const { HTTP_TIMEOUT } = require('./config');
const debug = require('debug')('jambonz:realtimedb-helpers');
async function getIbmAccessToken(client, logger, apiKey) {

View File

@@ -2,7 +2,7 @@ const formurlencoded = require('form-urlencoded');
const {Pool} = require('undici');
const pool = new Pool('https://auth.crt.nuance.com');
const {makeNuanceKey, makeBasicAuthHeader, noopLogger} = require('./utils');
const { HTTP_TIMEOUT } = require('./constants');
const { HTTP_TIMEOUT } = require('./config');
const debug = require('debug')('jambonz:realtimedb-helpers');
async function getNuanceAccessToken(client, logger, clientId, secret, scope) {

View File

@@ -9,7 +9,7 @@ const ttsGoogle = require('@google-cloud/text-to-speech');
const { PollyClient, DescribeVoicesCommand } = require('@aws-sdk/client-polly');
const getAwsAuthToken = require('./get-aws-sts-token');
const {Pool} = require('undici');
const { HTTP_TIMEOUT } = require('./constants');
const { HTTP_TIMEOUT } = require('./config');
const verbioVoicePool = new Pool('https://us.rest.speechcenter.verbio.com');
const getIbmVoices = async(client, logger, credentials) => {

View File

@@ -1,6 +1,6 @@
const {Pool} = require('undici');
const { noopLogger, makeVerbioKey } = require('./utils');
const { HTTP_TIMEOUT } = require('./constants');
const { HTTP_TIMEOUT } = require('./config');
const pool = new Pool('https://auth.speechcenter.verbio.com:444');
const debug = require('debug')('jambonz:realtimedb-helpers');

View File

@@ -19,7 +19,8 @@ const {
createNuanceClient,
createKryptonClient,
createRivaClient,
noopLogger
noopLogger,
makeFilePath
} = require('./utils');
const getNuanceAccessToken = require('./get-nuance-access-token');
const getVerbioAccessToken = require('./get-verbio-token');
@@ -37,8 +38,13 @@ const {
const {SynthesizeSpeechRequest} = require('../stubs/riva/proto/riva_tts_pb');
const {AudioEncoding} = require('../stubs/riva/proto/riva_audio_pb');
const debug = require('debug')('jambonz:realtimedb-helpers');
const EXPIRES = (process.env.JAMBONES_TTS_CACHE_DURATION_MINS || 4 * 60) * 60; // cache tts for 4 hours
const TMP_FOLDER = '/tmp';
const {
JAMBONES_DISABLE_TTS_STREAMING,
JAMBONES_HTTP_PROXY_IP,
JAMBONES_HTTP_PROXY_PORT,
JAMBONES_TTS_CACHE_DURATION_MINS,
} = require('./config');
const EXPIRES = JAMBONES_TTS_CACHE_DURATION_MINS;
const OpenAI = require('openai');
const getAwsAuthToken = require('./get-aws-sts-token');
@@ -154,19 +160,7 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
text
});
let filePath;
if (['nuance', 'nvidia', 'verbio'].includes(vendor) ||
(
(process.env.JAMBONES_TTS_TRIM_SILENCE || !process.env.JAMBONES_DISABLE_TTS_STREAMING) &&
['microsoft', 'azure'].includes(vendor)
) ||
(
!process.env.JAMBONES_DISABLE_TTS_STREAMING &&
['elevenlabs', 'deepgram', 'rimelabs'].includes(vendor)
)
) {
filePath = `${TMP_FOLDER}/${key.replace('tts:', `tts-${salt || ''}`)}.r8`;
}
else filePath = `${TMP_FOLDER}/${key.replace('tts:', `tts-${salt || ''}`)}.mp3`;
filePath = makeFilePath(vendor, key, salt);
debug(`synth key is ${key}`);
let cached;
if (!disableTtsCache) {
@@ -200,8 +194,7 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
case 'microsoft':
vendorLabel = 'microsoft';
audioBuffer = await synthMicrosoft(logger, {credentials, stats, language, voice, text, deploymentId,
filePath, renderForCaching, disableTtsStreaming});
if (audioBuffer?.filePath) return audioBuffer;
filePath, renderForCaching, disableTtsStreaming});
break;
case 'nuance':
model = model || 'enhanced';
@@ -219,25 +212,21 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
case 'elevenlabs':
audioBuffer = await synthElevenlabs(logger, {
credentials, options, stats, language, voice, text, renderForCaching, disableTtsStreaming, filePath
});
if (audioBuffer?.filePath) return audioBuffer;
});
break;
case 'playht':
audioBuffer = await synthPlayHT(logger, {
credentials, options, stats, language, voice, text, renderForCaching, disableTtsStreaming, filePath
});
if (audioBuffer?.filePath) return audioBuffer;
});
break;
case 'rimelabs':
audioBuffer = await synthRimelabs(logger, {
credentials, options, stats, language, voice, text, renderForCaching, disableTtsStreaming, filePath
});
if (audioBuffer?.filePath) return audioBuffer;
});
break;
case 'whisper':
audioBuffer = await synthWhisper(logger, {
credentials, stats, voice, text, renderForCaching, disableTtsStreaming});
if (audioBuffer?.filePath) return audioBuffer;
credentials, stats, voice, text, renderForCaching, disableTtsStreaming});
break;
case 'verbio':
audioBuffer = await synthVerbio(client, logger, {
@@ -246,8 +235,7 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
break;
case 'deepgram':
audioBuffer = await synthDeepgram(logger, {credentials, stats, model, text,
renderForCaching, disableTtsStreaming});
if (audioBuffer?.filePath) return audioBuffer;
renderForCaching, disableTtsStreaming});
break;
case vendor.startsWith('custom') ? vendor : 'cant_match_value':
({ audioBuffer, filePath } = await synthCustomVendor(logger,
@@ -256,6 +244,7 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
default:
assert(`synthAudio: unsupported speech vendor ${vendor}`);
}
if('filePath' in audioBuffer) return audioBuffer;
const diff = process.hrtime(startAt);
const time = diff[0] * 1e3 + diff[1] * 1e-6;
rtt = time.toFixed(0);
@@ -454,7 +443,7 @@ const synthMicrosoft = async(logger, {
content = `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${language}"><voice name="${voice}">${words}</voice></speak>`;
logger.info({content}, 'synthMicrosoft');
}
if (!process.env.JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
let params = '';
params += `{api_key=${apiKey}`;
params += `,language=${language}`;
@@ -464,8 +453,8 @@ const synthMicrosoft = async(logger, {
if (region) params += `,region=${region}`;
if (custom_tts_endpoint) params += `,endpointId=${custom_tts_endpoint}`;
if (custom_tts_endpoint_url) params += `,endpoint=${custom_tts_endpoint_url}`;
if (process.env.JAMBONES_HTTP_PROXY_IP) params += `,http_proxy_ip=${process.env.JAMBONES_HTTP_PROXY_IP}`;
if (process.env.JAMBONES_HTTP_PROXY_PORT) params += `,http_proxy_port=${process.env.JAMBONES_HTTP_PROXY_PORT}`;
if (JAMBONES_HTTP_PROXY_IP) params += `,http_proxy_ip=${JAMBONES_HTTP_PROXY_IP}`;
if (JAMBONES_HTTP_PROXY_PORT) params += `,http_proxy_port=${JAMBONES_HTTP_PROXY_PORT}`;
params += '}';
return {
filePath: `say:${params}${content.replace(/\n/g, ' ')}`,
@@ -494,10 +483,10 @@ const synthMicrosoft = async(logger, {
SpeechSynthesisOutputFormat.Raw8Khz16BitMonoPcm :
SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3;
if (process.env.JAMBONES_HTTP_PROXY_IP && process.env.JAMBONES_HTTP_PROXY_PORT) {
if (JAMBONES_HTTP_PROXY_IP && JAMBONES_HTTP_PROXY_PORT) {
logger.debug(
`synthMicrosoft: using proxy ${process.env.JAMBONES_HTTP_PROXY_IP}:${process.env.JAMBONES_HTTP_PROXY_PORT}`);
speechConfig.setProxy(process.env.JAMBONES_HTTP_PROXY_IP, process.env.JAMBONES_HTTP_PROXY_PORT);
`synthMicrosoft: using proxy ${JAMBONES_HTTP_PROXY_IP}:${JAMBONES_HTTP_PROXY_PORT}`);
speechConfig.setProxy(JAMBONES_HTTP_PROXY_IP, JAMBONES_HTTP_PROXY_PORT);
}
const synthesizer = new SpeechSynthesizer(speechConfig);
@@ -683,7 +672,7 @@ const synthElevenlabs = async(logger, {
const opts = !!options && Object.keys(options).length !== 0 ? options : JSON.parse(credOpts || '{}');
/* default to using the streaming interface, unless disabled by env var OR we want just a cache file */
if (!process.env.JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
let params = '';
params += `{api_key=${api_key}`;
params += ',vendor=elevenlabs';
@@ -736,7 +725,7 @@ const synthPlayHT = async(logger, {
const opts = !!options && Object.keys(options).length !== 0 ? options : JSON.parse(credOpts || '{}');
/* default to using the streaming interface, unless disabled by env var OR we want just a cache file */
if (!process.env.JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
let params = '';
params += `{api_key=${api_key}`;
params += `,user_id=${user_id}`;
@@ -791,7 +780,7 @@ const synthRimelabs = async(logger, {
const opts = !!options && Object.keys(options).length !== 0 ? options : JSON.parse(credOpts || '{}');
/* default to using the streaming interface, unless disabled by env var OR we want just a cache file */
if (!process.env.JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
let params = '';
params += `{api_key=${api_key}`;
params += `,model_id=${model_id}`;
@@ -873,7 +862,7 @@ const synthVerbio = async(client, logger, {credentials, stats, voice, text, rend
const synthWhisper = async(logger, {credentials, stats, voice, text, renderForCaching, disableTtsStreaming}) => {
const {api_key, model_id, baseURL, timeout, speed} = credentials;
/* if the env is set to stream then bag out, unless we are specifically rendering to generate a cache file */
if (!process.env.JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
let params = '';
params += `{api_key=${api_key}`;
params += `,model_id=${model_id}`;
@@ -912,7 +901,7 @@ const synthWhisper = async(logger, {credentials, stats, voice, text, renderForCa
const synthDeepgram = async(logger, {credentials, stats, model, text, renderForCaching, disableTtsStreaming}) => {
const {api_key} = credentials;
if (!process.env.JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
let params = '';
params += `{api_key=${api_key}`;
params += ',vendor=deepgram';

View File

@@ -6,7 +6,7 @@ const pool = new Pool('https://auth.crt.nuance.com');
const NUANCE_AUTH_ENDPOINT = 'tts.api.nuance.com:443';
const grpc = require('@grpc/grpc-js');
const formurlencoded = require('form-urlencoded');
const { HTTP_TIMEOUT } = require('./constants');
const { JAMBONES_DISABLE_TTS_STREAMING, JAMBONES_TTS_TRIM_SILENCE, TMP_FOLDER, HTTP_TIMEOUT } = require('./config');
const debug = require('debug')('jambonz:realtimedb-helpers');
/**
@@ -19,7 +19,44 @@ const debug = require('debug')('jambonz:realtimedb-helpers');
/**
 * Build the redis cache key for a piece of synthesized speech.
 * Key shape: tts[:account_sid]:<namespace>:<sha1-of-params>, where namespace
 * is the vendor name for custom vendors, otherwise the audio file extension
 * (mp3 or r8) so differently-encoded renders of the same text never collide.
 * NOTE: the stale pre-refactor `return` (old key format, no namespace) left
 * above the new code made everything below it unreachable — removed.
 */
function makeSynthKey({account_sid = '', vendor, language, voice, engine = '', text}) {
  const hash = crypto.createHash('sha1');
  hash.update(`${language}:${vendor}:${voice}:${engine}:${text}`);
  const hexHashKey = hash.digest('hex');
  const accountKey = account_sid ? `:${account_sid}` : '';
  const namespace = vendor.startsWith('custom') ? vendor : getFileExtension(vendor);
  const key = `tts${accountKey}:${namespace}:${hexHashKey}`;
  return key;
}
/**
 * Map a tts cache key to its on-disk location under TMP_FOLDER: the leading
 * 'tts:' prefix becomes 'tts-<salt>' and the vendor's file extension
 * (mp3 or r8, per getFileExtension) is appended.
 */
function makeFilePath(vendor, key, salt = '') {
  const basename = key.replace('tts:', `tts-${salt}`);
  return `${TMP_FOLDER}/${basename}.${getFileExtension(vendor)}`;
}
/**
 * Choose the audio file extension for a vendor's synthesized audio:
 * 'r8' (raw 8kHz pcm) when the render is produced by the streaming/trimming
 * paths, otherwise 'mp3'.
 */
function getFileExtension(vendor) {
  const mp3Extension = 'mp3';
  const r8Extension = 'r8';
  switch (vendor) {
    case 'azure':
    case 'microsoft':
      /* microsoft/azure renders raw pcm when streaming is enabled or silence trimming is on */
      if (!JAMBONES_DISABLE_TTS_STREAMING || JAMBONES_TTS_TRIM_SILENCE) {
        return r8Extension;
      } else {
        return mp3Extension;
      }
    case 'deepgram':
    case 'elevenlabs':
    case 'rimelabs': // fix: was misspelled 'rimlabs', which fell through to the mp3 default
      if (!JAMBONES_DISABLE_TTS_STREAMING) {
        return r8Extension;
      } else {
        return mp3Extension;
      }
    case 'nuance':
    case 'nvidia':
      return r8Extension;
    default:
      return mp3Extension;
  }
}
const noopLogger = {
@@ -130,5 +167,6 @@ module.exports = {
createRivaClient,
makeBasicAuthHeader,
NUANCE_AUTH_ENDPOINT,
noopLogger
noopLogger,
makeFilePath
};

View File

@@ -12,6 +12,7 @@
"test": "NODE_ENV=test node test/ ",
"coverage": "nyc --reporter html --report-dir ./coverage npm run test",
"jslint": "eslint index.js lib",
"jslint:fix": "eslint --fix '**/*.js'",
"build": "./build_stubs.sh"
},
"repository": {

View File

@@ -62,9 +62,9 @@ function deserialize_nuance_tts_v1_UnarySynthesisResponse(buffer_arg) {
//
// The Synthesizer service offers these functionalities:
// - GetVoices: Queries the list of available voices, with filters to reduce the search space.
// - Synthesize: Synthesizes audio from input text and parameters, and returns an audio stream.
// - UnarySynthesize: Synthesizes audio from input text and parameters, and returns a single audio response.
// - GetVoices: Queries the list of available voices, with filters to reduce the search space.
// - Synthesize: Synthesizes audio from input text and parameters, and returns an audio stream.
// - UnarySynthesize: Synthesizes audio from input text and parameters, and returns a single audio response.
var SynthesizerService = exports.SynthesizerService = {
getVoices: {
path: '/nuance.tts.v1.Synthesizer/GetVoices',

View File

@@ -1 +1 @@
// GENERATED CODE -- NO SERVICES IN PROTO
// GENERATED CODE -- NO SERVICES IN PROTO

View File

@@ -57,7 +57,7 @@ function deserialize_nvidia_riva_tts_SynthesizeSpeechResponse(buffer_arg) {
var RivaSpeechSynthesisService = exports.RivaSpeechSynthesisService = {
// Used to request text-to-speech from the service. Submit a request containing the
// desired text and configuration, and receive audio bytes in the requested format.
synthesize: {
synthesize: {
path: '/nvidia.riva.tts.RivaSpeechSynthesis/Synthesize',
requestStream: false,
responseStream: false,
@@ -69,9 +69,9 @@ synthesize: {
responseDeserialize: deserialize_nvidia_riva_tts_SynthesizeSpeechResponse,
},
// Used to request text-to-speech returned via stream as it becomes available.
// Submit a SynthesizeSpeechRequest with desired text and configuration,
// and receive stream of bytes in the requested format.
synthesizeOnline: {
// Submit a SynthesizeSpeechRequest with desired text and configuration,
// and receive stream of bytes in the requested format.
synthesizeOnline: {
path: '/nvidia.riva.tts.RivaSpeechSynthesis/SynthesizeOnline',
requestStream: false,
responseStream: true,
@@ -83,7 +83,7 @@ synthesizeOnline: {
responseDeserialize: deserialize_nvidia_riva_tts_SynthesizeSpeechResponse,
},
// Enables clients to request the configuration of the current Synthesize service, or a specific model within the service.
getRivaSynthesisConfig: {
getRivaSynthesisConfig: {
path: '/nvidia.riva.tts.RivaSpeechSynthesis/GetRivaSynthesisConfig',
requestStream: false,
responseStream: false,

View File

@@ -789,8 +789,19 @@ test('TTS Cache tests', async(t) => {
t.ok(error, `error returned when specified key was not found`);
// make sure other tts keys are still there
const cached = (await client.keys('tts:*')).length;
t.ok(cached >= 1, `successfully kept all non-specified tts records in cache`);
const cached = await client.keys('tts:*')
t.ok(cached.length >= 1, `successfully kept all non-specified tts records in cache`);
// retrieve keys from cache and check the key contains the file extension
let key = cached[0];
t.ok(key.includes('mp3'), `tts cache extension shoult be part of the key and equal mp3`);
process.env.VG_TRIM_TTS_SILENCE = 'true';
await client.set(makeSynthKey({ vendor: 'azure' }), 'value');
const r8Keys = await client.keys('tts:r8*');
key = r8Keys[0];
t.ok(key.includes('r8'), `tts cache extension shoult be part of the key and equal r8`);
} catch (err) {
console.error(JSON.stringify(err));