Improve handling of TTS cache by adding the file extension to the cache key

This commit is contained in:
Markus Frindt
2024-05-24 14:03:31 +02:00
parent acb2d0c7ce
commit cb6ab2479f
8 changed files with 91 additions and 39 deletions

View File

@@ -1,6 +1,7 @@
const fs = require('fs/promises'); const fs = require('fs/promises');
const {noopLogger, makeSynthKey} = require('./utils'); const {noopLogger, makeSynthKey} = require('./utils');
const EXPIRES = (process.env.JAMBONES_TTS_CACHE_DURATION_MINS || 4 * 60) * 60; // cache tts for 4 hours const {JAMBONES_TTS_CACHE_DURATION_MINS} = require('./config');
const EXPIRES = JAMBONES_TTS_CACHE_DURATION_MINS;
async function addFileToCache(client, logger, path, async function addFileToCache(client, logger, path,
{account_sid, vendor, language, voice, deploymentId, engine, text}) { {account_sid, vendor, language, voice, deploymentId, engine, text}) {

17
lib/config.js Normal file
View File

@@ -0,0 +1,17 @@
const JAMBONES_TTS_TRIM_SILENCE = process.env.JAMBONES_TTS_TRIM_SILENCE;
const JAMBONES_DISABLE_TTS_STREAMING = process.env.JAMBONES_DISABLE_TTS_STREAMING;
const JAMBONES_HTTP_PROXY_IP = process.env.JAMBONES_HTTP_PROXY_IP;
const JAMBONES_HTTP_PROXY_PORT = process.env.JAMBONES_HTTP_PROXY_PORT;
const JAMBONES_TTS_CACHE_DURATION_MINS = (parseInt(process.env.JAMBONES_TTS_CACHE_DURATION_MINS) || 4 * 60) * 60; // cache tts for 4 hours
const TMP_FOLDER = '/tmp';
module.exports = {
JAMBONES_TTS_TRIM_SILENCE,
JAMBONES_DISABLE_TTS_STREAMING,
JAMBONES_HTTP_PROXY_IP,
JAMBONES_HTTP_PROXY_PORT,
JAMBONES_TTS_CACHE_DURATION_MINS,
TMP_FOLDER
};

View File

@@ -19,7 +19,8 @@ const {
createNuanceClient, createNuanceClient,
createKryptonClient, createKryptonClient,
createRivaClient, createRivaClient,
noopLogger noopLogger,
makeFilePath
} = require('./utils'); } = require('./utils');
const getNuanceAccessToken = require('./get-nuance-access-token'); const getNuanceAccessToken = require('./get-nuance-access-token');
const { const {
@@ -36,8 +37,13 @@ const {
const {SynthesizeSpeechRequest} = require('../stubs/riva/proto/riva_tts_pb'); const {SynthesizeSpeechRequest} = require('../stubs/riva/proto/riva_tts_pb');
const {AudioEncoding} = require('../stubs/riva/proto/riva_audio_pb'); const {AudioEncoding} = require('../stubs/riva/proto/riva_audio_pb');
const debug = require('debug')('jambonz:realtimedb-helpers'); const debug = require('debug')('jambonz:realtimedb-helpers');
const EXPIRES = (process.env.JAMBONES_TTS_CACHE_DURATION_MINS || 4 * 60) * 60; // cache tts for 4 hours const {
const TMP_FOLDER = '/tmp'; JAMBONES_DISABLE_TTS_STREAMING,
JAMBONES_HTTP_PROXY_IP,
JAMBONES_HTTP_PROXY_PORT,
JAMBONES_TTS_CACHE_DURATION_MINS,
} = require('./config');
const EXPIRES = JAMBONES_TTS_CACHE_DURATION_MINS;
const OpenAI = require('openai'); const OpenAI = require('openai');
const getAwsAuthToken = require('./get-aws-sts-token'); const getAwsAuthToken = require('./get-aws-sts-token');
@@ -149,19 +155,7 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
text text
}); });
let filePath; let filePath;
if (['nuance', 'nvidia'].includes(vendor) || filePath = makeFilePath(vendor, key, salt);
(
(process.env.JAMBONES_TTS_TRIM_SILENCE || !process.env.JAMBONES_DISABLE_TTS_STREAMING) &&
['microsoft', 'azure'].includes(vendor)
) ||
(
!process.env.JAMBONES_DISABLE_TTS_STREAMING &&
['elevenlabs', 'deepgram', 'rimelabs'].includes(vendor)
)
) {
filePath = `${TMP_FOLDER}/${key.replace('tts:', `tts-${salt || ''}`)}.r8`;
}
else filePath = `${TMP_FOLDER}/${key.replace('tts:', `tts-${salt || ''}`)}.mp3`;
debug(`synth key is ${key}`); debug(`synth key is ${key}`);
let cached; let cached;
if (!disableTtsCache) { if (!disableTtsCache) {
@@ -444,7 +438,7 @@ const synthMicrosoft = async(logger, {
content = `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${language}"><voice name="${voice}">${words}</voice></speak>`; content = `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${language}"><voice name="${voice}">${words}</voice></speak>`;
logger.info({content}, 'synthMicrosoft'); logger.info({content}, 'synthMicrosoft');
} }
if (!process.env.JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) { if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
let params = ''; let params = '';
params += `{api_key=${apiKey}`; params += `{api_key=${apiKey}`;
params += `,language=${language}`; params += `,language=${language}`;
@@ -454,8 +448,8 @@ const synthMicrosoft = async(logger, {
if (region) params += `,region=${region}`; if (region) params += `,region=${region}`;
if (custom_tts_endpoint) params += `,endpointId=${custom_tts_endpoint}`; if (custom_tts_endpoint) params += `,endpointId=${custom_tts_endpoint}`;
if (custom_tts_endpoint_url) params += `,endpoint=${custom_tts_endpoint_url}`; if (custom_tts_endpoint_url) params += `,endpoint=${custom_tts_endpoint_url}`;
if (process.env.JAMBONES_HTTP_PROXY_IP) params += `,http_proxy_ip=${process.env.JAMBONES_HTTP_PROXY_IP}`; if (JAMBONES_HTTP_PROXY_IP) params += `,http_proxy_ip=${JAMBONES_HTTP_PROXY_IP}`;
if (process.env.JAMBONES_HTTP_PROXY_PORT) params += `,http_proxy_port=${process.env.JAMBONES_HTTP_PROXY_PORT}`; if (JAMBONES_HTTP_PROXY_PORT) params += `,http_proxy_port=${JAMBONES_HTTP_PROXY_PORT}`;
params += '}'; params += '}';
return { return {
filePath: `say:${params}${content.replace(/\n/g, ' ')}`, filePath: `say:${params}${content.replace(/\n/g, ' ')}`,
@@ -484,10 +478,10 @@ const synthMicrosoft = async(logger, {
SpeechSynthesisOutputFormat.Raw8Khz16BitMonoPcm : SpeechSynthesisOutputFormat.Raw8Khz16BitMonoPcm :
SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3; SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3;
if (process.env.JAMBONES_HTTP_PROXY_IP && process.env.JAMBONES_HTTP_PROXY_PORT) { if (JAMBONES_HTTP_PROXY_IP && JAMBONES_HTTP_PROXY_PORT) {
logger.debug( logger.debug(
`synthMicrosoft: using proxy ${process.env.JAMBONES_HTTP_PROXY_IP}:${process.env.JAMBONES_HTTP_PROXY_PORT}`); `synthMicrosoft: using proxy ${JAMBONES_HTTP_PROXY_IP}:${JAMBONES_HTTP_PROXY_PORT}`);
speechConfig.setProxy(process.env.JAMBONES_HTTP_PROXY_IP, process.env.JAMBONES_HTTP_PROXY_PORT); speechConfig.setProxy(JAMBONES_HTTP_PROXY_IP, JAMBONES_HTTP_PROXY_PORT);
} }
const synthesizer = new SpeechSynthesizer(speechConfig); const synthesizer = new SpeechSynthesizer(speechConfig);
@@ -673,7 +667,7 @@ const synthElevenlabs = async(logger, {
const opts = !!options && Object.keys(options).length !== 0 ? options : JSON.parse(credOpts || '{}'); const opts = !!options && Object.keys(options).length !== 0 ? options : JSON.parse(credOpts || '{}');
/* default to using the streaming interface, unless disabled by env var OR we want just a cache file */ /* default to using the streaming interface, unless disabled by env var OR we want just a cache file */
if (!process.env.JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) { if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
let params = ''; let params = '';
params += `{api_key=${api_key}`; params += `{api_key=${api_key}`;
params += ',vendor=elevenlabs'; params += ',vendor=elevenlabs';
@@ -726,7 +720,7 @@ const synthPlayHT = async(logger, {
const opts = !!options && Object.keys(options).length !== 0 ? options : JSON.parse(credOpts || '{}'); const opts = !!options && Object.keys(options).length !== 0 ? options : JSON.parse(credOpts || '{}');
/* default to using the streaming interface, unless disabled by env var OR we want just a cache file */ /* default to using the streaming interface, unless disabled by env var OR we want just a cache file */
if (!process.env.JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) { if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
let params = ''; let params = '';
params += `{api_key=${api_key}`; params += `{api_key=${api_key}`;
params += `,user_id=${user_id}`; params += `,user_id=${user_id}`;
@@ -781,7 +775,7 @@ const synthRimelabs = async(logger, {
const opts = !!options && Object.keys(options).length !== 0 ? options : JSON.parse(credOpts || '{}'); const opts = !!options && Object.keys(options).length !== 0 ? options : JSON.parse(credOpts || '{}');
/* default to using the streaming interface, unless disabled by env var OR we want just a cache file */ /* default to using the streaming interface, unless disabled by env var OR we want just a cache file */
if (!process.env.JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) { if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
let params = ''; let params = '';
params += `{api_key=${api_key}`; params += `{api_key=${api_key}`;
params += `,model_id=${model_id}`; params += `,model_id=${model_id}`;
@@ -823,7 +817,7 @@ const synthRimelabs = async(logger, {
const synthWhisper = async(logger, {credentials, stats, voice, text, renderForCaching, disableTtsStreaming}) => { const synthWhisper = async(logger, {credentials, stats, voice, text, renderForCaching, disableTtsStreaming}) => {
const {api_key, model_id, baseURL, timeout, speed} = credentials; const {api_key, model_id, baseURL, timeout, speed} = credentials;
/* if the env is set to stream then bag out, unless we are specifically rendering to generate a cache file */ /* if the env is set to stream then bag out, unless we are specifically rendering to generate a cache file */
if (!process.env.JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) { if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
let params = ''; let params = '';
params += `{api_key=${api_key}`; params += `{api_key=${api_key}`;
params += `,model_id=${model_id}`; params += `,model_id=${model_id}`;
@@ -862,7 +856,7 @@ const synthWhisper = async(logger, {credentials, stats, voice, text, renderForCa
const synthDeepgram = async(logger, {credentials, stats, model, text, renderForCaching, disableTtsStreaming}) => { const synthDeepgram = async(logger, {credentials, stats, model, text, renderForCaching, disableTtsStreaming}) => {
const {api_key} = credentials; const {api_key} = credentials;
if (!process.env.JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) { if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
let params = ''; let params = '';
params += `{api_key=${api_key}`; params += `{api_key=${api_key}`;
params += ',vendor=deepgram'; params += ',vendor=deepgram';

View File

@@ -7,6 +7,7 @@ const HTTP_TIMEOUT = 5000;
const NUANCE_AUTH_ENDPOINT = 'tts.api.nuance.com:443'; const NUANCE_AUTH_ENDPOINT = 'tts.api.nuance.com:443';
const grpc = require('@grpc/grpc-js'); const grpc = require('@grpc/grpc-js');
const formurlencoded = require('form-urlencoded'); const formurlencoded = require('form-urlencoded');
const { JAMBONES_DISABLE_TTS_STREAMING, JAMBONES_TTS_TRIM_SILENCE, TMP_FOLDER } = require('./config');
const debug = require('debug')('jambonz:realtimedb-helpers'); const debug = require('debug')('jambonz:realtimedb-helpers');
/** /**
@@ -19,7 +20,44 @@ const debug = require('debug')('jambonz:realtimedb-helpers');
function makeSynthKey({account_sid = '', vendor, language, voice, engine = '', text}) { function makeSynthKey({account_sid = '', vendor, language, voice, engine = '', text}) {
const hash = crypto.createHash('sha1'); const hash = crypto.createHash('sha1');
hash.update(`${language}:${vendor}:${voice}:${engine}:${text}`); hash.update(`${language}:${vendor}:${voice}:${engine}:${text}`);
return `tts${account_sid ? (':' + account_sid) : ''}:${hash.digest('hex')}`; const hexHashKey = hash.digest('hex');
const accountKey = account_sid ? `:${account_sid}` : '';
const extension = getFileExtension(vendor);
const key = `tts${accountKey}:${extension}:${hexHashKey}`;
return key;
}
/**
 * Build the path of the temporary audio file for a synth key.
 *
 * The first occurrence of 'tts:' in the key is replaced by 'tts-' followed by
 * the salt, and the vendor-specific extension is appended.
 *
 * @param {string} vendor - TTS vendor name, used to select the file extension
 * @param {string} key - cache key as produced by makeSynthKey
 * @param {string} [salt] - optional string inserted after the 'tts-' prefix
 * @returns {string} file path inside TMP_FOLDER
 */
function makeFilePath(vendor, key, salt = '') {
  const extension = getFileExtension(vendor);
  // a default parameter only covers undefined; treat null/empty the same way
  const prefix = `tts-${salt || ''}`;
  return `${TMP_FOLDER}/${key.replace('tts:', prefix)}.${extension}`;
}
/**
 * Select the audio file extension used for a vendor's synthesized audio.
 *
 * - nuance, nvidia: always 'r8'
 * - azure, microsoft: 'r8' unless streaming is disabled and silence trimming
 *   is not enabled, in which case 'mp3'
 * - deepgram, elevenlabs, rimelabs: 'r8' unless streaming is disabled
 * - any other vendor: 'mp3'
 *
 * @param {string} vendor - TTS vendor name
 * @returns {string} 'r8' or 'mp3' (without a leading dot)
 */
function getFileExtension(vendor) {
  const mp3Extension = 'mp3';
  const r8Extension = 'r8';

  switch (vendor) {
    case 'azure':
    case 'microsoft':
      return (!JAMBONES_DISABLE_TTS_STREAMING || JAMBONES_TTS_TRIM_SILENCE) ? r8Extension : mp3Extension;
    case 'deepgram':
    case 'elevenlabs':
    case 'rimelabs': // was misspelled 'rimlabs', which sent this vendor to the mp3 default
      return !JAMBONES_DISABLE_TTS_STREAMING ? r8Extension : mp3Extension;
    case 'nuance':
    case 'nvidia':
      return r8Extension;
    default:
      return mp3Extension;
  }
}
const noopLogger = { const noopLogger = {
@@ -123,5 +161,6 @@ module.exports = {
createRivaClient, createRivaClient,
makeBasicAuthHeader, makeBasicAuthHeader,
NUANCE_AUTH_ENDPOINT, NUANCE_AUTH_ENDPOINT,
noopLogger noopLogger,
makeFilePath
}; };

View File

@@ -12,6 +12,7 @@
"test": "NODE_ENV=test node test/ ", "test": "NODE_ENV=test node test/ ",
"coverage": "nyc --reporter html --report-dir ./coverage npm run test", "coverage": "nyc --reporter html --report-dir ./coverage npm run test",
"jslint": "eslint index.js lib", "jslint": "eslint index.js lib",
"jslint:fix": "eslint --fix '**/*.js'",
"build": "./build_stubs.sh" "build": "./build_stubs.sh"
}, },
"repository": { "repository": {

View File

@@ -62,9 +62,9 @@ function deserialize_nuance_tts_v1_UnarySynthesisResponse(buffer_arg) {
// //
// The Synthesizer service offers these functionalities: // The Synthesizer service offers these functionalities:
// - GetVoices: Queries the list of available voices, with filters to reduce the search space. // - GetVoices: Queries the list of available voices, with filters to reduce the search space.
// - Synthesize: Synthesizes audio from input text and parameters, and returns an audio stream. // - Synthesize: Synthesizes audio from input text and parameters, and returns an audio stream.
// - UnarySynthesize: Synthesizes audio from input text and parameters, and returns a single audio response. // - UnarySynthesize: Synthesizes audio from input text and parameters, and returns a single audio response.
var SynthesizerService = exports.SynthesizerService = { var SynthesizerService = exports.SynthesizerService = {
getVoices: { getVoices: {
path: '/nuance.tts.v1.Synthesizer/GetVoices', path: '/nuance.tts.v1.Synthesizer/GetVoices',

View File

@@ -1 +1 @@
// GENERATED CODE -- NO SERVICES IN PROTO // GENERATED CODE -- NO SERVICES IN PROTO

View File

@@ -57,7 +57,7 @@ function deserialize_nvidia_riva_tts_SynthesizeSpeechResponse(buffer_arg) {
var RivaSpeechSynthesisService = exports.RivaSpeechSynthesisService = { var RivaSpeechSynthesisService = exports.RivaSpeechSynthesisService = {
// Used to request text-to-speech from the service. Submit a request containing the // Used to request text-to-speech from the service. Submit a request containing the
// desired text and configuration, and receive audio bytes in the requested format. // desired text and configuration, and receive audio bytes in the requested format.
synthesize: { synthesize: {
path: '/nvidia.riva.tts.RivaSpeechSynthesis/Synthesize', path: '/nvidia.riva.tts.RivaSpeechSynthesis/Synthesize',
requestStream: false, requestStream: false,
responseStream: false, responseStream: false,
@@ -69,9 +69,9 @@ synthesize: {
responseDeserialize: deserialize_nvidia_riva_tts_SynthesizeSpeechResponse, responseDeserialize: deserialize_nvidia_riva_tts_SynthesizeSpeechResponse,
}, },
// Used to request text-to-speech returned via stream as it becomes available. // Used to request text-to-speech returned via stream as it becomes available.
// Submit a SynthesizeSpeechRequest with desired text and configuration, // Submit a SynthesizeSpeechRequest with desired text and configuration,
// and receive stream of bytes in the requested format. // and receive stream of bytes in the requested format.
synthesizeOnline: { synthesizeOnline: {
path: '/nvidia.riva.tts.RivaSpeechSynthesis/SynthesizeOnline', path: '/nvidia.riva.tts.RivaSpeechSynthesis/SynthesizeOnline',
requestStream: false, requestStream: false,
responseStream: true, responseStream: true,
@@ -83,7 +83,7 @@ synthesizeOnline: {
responseDeserialize: deserialize_nvidia_riva_tts_SynthesizeSpeechResponse, responseDeserialize: deserialize_nvidia_riva_tts_SynthesizeSpeechResponse,
}, },
// Enables clients to request the configuration of the current Synthesize service, or a specific model within the service. // Enables clients to request the configuration of the current Synthesize service, or a specific model within the service.
getRivaSynthesisConfig: { getRivaSynthesisConfig: {
path: '/nvidia.riva.tts.RivaSpeechSynthesis/GetRivaSynthesisConfig', path: '/nvidia.riva.tts.RivaSpeechSynthesis/GetRivaSynthesisConfig',
requestStream: false, requestStream: false,
responseStream: false, responseStream: false,