const assert = require('assert');
const fs = require('fs');
const bent = require('bent');
const ttsGoogle = require('@google-cloud/text-to-speech');
const { PollyClient, SynthesizeSpeechCommand } = require('@aws-sdk/client-polly');
const { CartesiaClient } = require('@cartesia/cartesia-js');
const sdk = require('microsoft-cognitiveservices-speech-sdk');
const TextToSpeechV1 = require('ibm-watson/text-to-speech/v1');
const { IamAuthenticator } = require('ibm-watson/auth');
const {
  ResultReason,
  SpeechConfig,
  SpeechSynthesizer,
  CancellationDetails,
  SpeechSynthesisOutputFormat
} = sdk;
const {
  makeSynthKey,
  createNuanceClient,
  createKryptonClient,
  createRivaClient,
  noopLogger,
  makeFilePath,
  makePlayhtKey
} = require('./utils');
const getNuanceAccessToken = require('./get-nuance-access-token');
const getVerbioAccessToken = require('./get-verbio-token');
const {
  SynthesisRequest,
  Voice,
  AudioFormat,
  AudioParameters,
  PCM,
  Input,
  Text,
  SSML,
  EventParameters
} = require('../stubs/nuance/synthesizer_pb');
const {SynthesizeSpeechRequest} = require('../stubs/riva/proto/riva_tts_pb');
const {AudioEncoding} = require('../stubs/riva/proto/riva_audio_pb');
const debug = require('debug')('jambonz:realtimedb-helpers');
const {
  JAMBONES_DISABLE_TTS_STREAMING,
  JAMBONES_DISABLE_AZURE_TTS_STREAMING,
  JAMBONES_HTTP_PROXY_IP,
  JAMBONES_HTTP_PROXY_PORT,
  JAMBONES_TTS_CACHE_DURATION_MINS,
  JAMBONES_TTS_TRIM_SILENCE,
  JAMBONES_AZURE_ENABLE_SSML
} = require('./config');
const EXPIRES = JAMBONES_TTS_CACHE_DURATION_MINS;
const OpenAI = require('openai');
const getAwsAuthToken = require('./get-aws-sts-token');

/* strip trailing 16-bit PCM silence (zero-valued samples) from a raw audio buffer */
const trimTrailingSilence = (buffer) => {
  assert.ok(buffer instanceof Buffer, 'trimTrailingSilence - argument is not a Buffer');
  let offset = buffer.length;
  while (offset > 0) {
    // Get 16-bit value from the buffer (read in reverse)
    const value = buffer.readUInt16BE(offset - 2);
    if (value !== 0) {
      break;
    }
    offset -= 2;
  }
  // Trim the silence from the end
  return offset === buffer.length ? buffer : buffer.subarray(0, offset);
};

/**
 * Synthesize speech to an mp3 file, and also cache the generated speech
 * in redis (base64 format) for JAMBONES_TTS_CACHE_DURATION_MINS minutes so as
 * to avoid unnecessarily paying time and again for speech synthesis of the same text.
 * It is the responsibility of the caller to unlink the mp3 file after use.
 *
 * @param {*} client - redis client
 * @param {*} logger - pino logger
 * @param {object} opts - options
 * @param {string} opts.vendor - 'google' or 'aws' ('polly' is an alias for 'aws')
 * @param {string} opts.language - language code
 * @param {string} opts.voice - voice identifier
 * @param {string} opts.text - text or ssml to synthesize
 * @param {boolean} opts.disableTtsCache - disable TTS Cache retrieval
 * @returns object containing filepath to an mp3 file in the /tmp folder containing
 * the synthesized audio, and a variable indicating whether it was served from cache
 */
async function synthAudio(client, createHash, retrieveHash, logger, stats, {
  account_sid,
  vendor,
  language,
  voice,
  gender,
  text,
  engine,
  salt,
  model,
  credentials,
  deploymentId,
  disableTtsCache,
  renderForCaching = false,
  disableTtsStreaming,
  options,
  instructions
}) {
  let audioData;
  let servedFromCache = false;
  let rtt;
  logger = logger || noopLogger;

  assert.ok([
    'google', 'aws', 'polly', 'microsoft', 'wellsaid', 'nuance', 'nvidia', 'ibm',
    'elevenlabs', 'whisper', 'deepgram', 'playht', 'rimelabs', 'verbio', 'cartesia',
    'inworld', 'resemble'
  ].includes(vendor) || vendor.startsWith('custom'),
  `synthAudio supported vendors are google, aws, microsoft, nuance, nvidia, wellsaid, etc, not ${vendor}`);

  if ('google' === vendor) {
    assert.ok(language, 'synthAudio requires language when google is used');
  } else if (['aws', 'polly'].includes(vendor)) {
    assert.ok(voice, 'synthAudio requires voice when aws polly is used');
  } else if ('microsoft' === vendor) {
    assert.ok(language || deploymentId, 'synthAudio requires language when microsoft is used');
    assert.ok(voice || deploymentId, 'synthAudio requires voice when microsoft is used');
  } else if ('nuance' === vendor) {
    assert.ok(voice, 'synthAudio requires voice when nuance is used');
    if (!credentials.nuance_tts_uri) {
      assert.ok(credentials.client_id, 'synthAudio requires client_id in credentials when nuance is used');
      assert.ok(credentials.secret, 'synthAudio requires secret in credentials when nuance is used');
    }
  } else if ('nvidia' === vendor) {
    assert.ok(voice, 'synthAudio requires voice when nvidia is used');
    assert.ok(language, 'synthAudio requires language when nvidia is used');
    assert.ok(credentials.riva_server_uri, 'synthAudio requires riva_server_uri in credentials when nvidia is used');
  } else if ('ibm' === vendor) {
    assert.ok(voice, 'synthAudio requires voice when ibm is used');
    assert.ok(credentials.tts_region, 'synthAudio requires tts_region in credentials when ibm watson is used');
    assert.ok(credentials.tts_api_key, 'synthAudio requires tts_api_key in credentials when ibm watson is used');
  } else if ('wellsaid' === vendor) {
    language = 'en-US'; // WellSaid only supports English atm
    assert.ok(voice, 'synthAudio requires voice when wellsaid is used');
    assert.ok(!text.startsWith('<speak'), 'wellsaid does not support SSML content');
  }

  const key = makeSynthKey({account_sid, vendor, language: language || '', voice, engine, model, text, instructions});
  debug(`synth key is ${key}`);
  let cached;
  if (!disableTtsCache) {
    cached = await client.get(key);
  }
  if (cached) {
    // found in cache - extend the expiry and use it
    debug('result WAS found in cache');
    servedFromCache = true;
    stats.increment('tts.cache.requests', ['found:yes']);
    audioData = JSON.parse(cached);
    audioData.audioContent = Buffer.from(audioData.audioContent, 'base64');
    client.expire(key, EXPIRES)
      .catch((err) => logger.info(err, 'Error setting expires'));
  } else {
    // not found in cache - go get it from speech vendor and add to cache
    debug('result was NOT found in cache');
    stats.increment('tts.cache.requests', ['found:no']);
    let vendorLabel = vendor;
    const startAt = process.hrtime();
    switch (vendor) {
      case 'google':
        audioData = await synthGoogle(logger, {credentials, stats, language, voice, gender, key, text});
        break;
      case 'aws':
      case 'polly':
        vendorLabel = 'aws';
        audioData = await synthPolly(createHash, retrieveHash, logger,
          {credentials, stats, language, voice, key, text, engine,
            renderForCaching, disableTtsStreaming, disableTtsCache});
        break;
      case 'azure':
      case 'microsoft':
        vendorLabel = 'microsoft';
        audioData = await synthMicrosoft(logger,
          {credentials, stats, language, voice, key, text, deploymentId,
            renderForCaching, disableTtsStreaming, disableTtsCache});
        break;
      case 'nuance':
        model = model || 'enhanced';
        audioData = await synthNuance(client, logger, {credentials, stats, voice, model, key, text});
        break;
      case 'nvidia':
        audioData = await synthNvidia(client, logger,
          {credentials, stats, language, voice, model, key, text,
            renderForCaching, disableTtsStreaming, disableTtsCache});
        break;
      case 'ibm':
        audioData = await synthIbm(logger, {credentials, stats, voice, key, text});
        break;
      case 'wellsaid':
        audioData = await synthWellSaid(logger, {credentials, stats, language, voice, key, text});
        break;
      case 'elevenlabs':
        audioData = await synthElevenlabs(logger,
          {credentials, options, stats, language, voice, key, text,
            renderForCaching, disableTtsStreaming, disableTtsCache});
        break;
      case 'playht':
        audioData = await synthPlayHT(client, logger,
          {credentials, options, stats, language, voice, key, text,
            renderForCaching, disableTtsStreaming, disableTtsCache});
        break;
      case 'cartesia':
        audioData = await synthCartesia(logger,
          {credentials, options, stats, language, voice, key, text,
            renderForCaching, disableTtsStreaming, disableTtsCache});
        break;
      case 'inworld':
        audioData = await synthInworld(logger,
          {credentials, options, stats, language, voice, key, text,
            renderForCaching, disableTtsStreaming, disableTtsCache});
        break;
      case 'rimelabs':
        audioData = await synthRimelabs(logger,
          {credentials, options, stats, language, voice, key, text,
            renderForCaching, disableTtsStreaming, disableTtsCache});
        break;
      case 'whisper':
        audioData = await synthWhisper(logger,
          {credentials, stats, voice, key, text, instructions,
            renderForCaching, disableTtsStreaming, disableTtsCache});
        break;
      case 'verbio':
        audioData = await synthVerbio(client, logger,
          {credentials, stats, voice, key, text,
            renderForCaching, disableTtsStreaming, disableTtsCache});
        if (audioData?.filePath) return audioData;
        break;
      case 'deepgram':
        audioData = await synthDeepgram(logger,
          {credentials, stats, model, key, text,
            renderForCaching, disableTtsStreaming, disableTtsCache});
        break;
      case 'resemble':
        audioData = await synthResemble(logger,
          {credentials, stats, voice, key, text, options,
            renderForCaching, disableTtsStreaming, disableTtsCache});
        break;
      // a case expression that evaluates to vendor only when vendor starts with 'custom'
      case vendor.startsWith('custom') ? vendor : 'cant_match_value':
        audioData = await synthCustomVendor(logger,
          {credentials, stats, language, voice, key, text,
            renderForCaching, disableTtsStreaming, disableTtsCache});
        break;
      default:
        assert.fail(`synthAudio: unsupported speech vendor ${vendor}`);
    }
    if ('filePath' in audioData) return audioData;

    const diff = process.hrtime(startAt);
    const time = diff[0] * 1e3 + diff[1] * 1e-6;
    rtt = time.toFixed(0);
    stats.histogram('tts.response_time', rtt, [`vendor:${vendorLabel}`]);
    debug(`tts rtt time for ${text.length} chars on ${vendorLabel}: ${rtt}`);
    logger.info(`tts rtt time for ${text.length} chars on ${vendorLabel}: ${rtt}`);

    // Save audio json to cache
    client.setex(key, EXPIRES, JSON.stringify({
      ...audioData,
      audioContent: audioData.audioContent?.toString('base64')
    }))
      .catch((err) => logger.error(err, `error calling setex on key ${key}`));
  }

  return new Promise((resolve, reject) => {
    const { audioContent, extension } = audioData;
    const filePath = makeFilePath({ key, salt, extension });
    fs.writeFile(filePath, audioContent, (err) => {
      if (err) return reject(err);
      resolve({filePath, servedFromCache, rtt});
    });
  });
}
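/*
 * For reference, a sketch of the cached value: each vendor helper below resolves with
 * an object shaped like
 *   { audioContent: <Buffer>, extension: 'mp3' | 'wav' | 'r8' | ..., sampleRate: 8000 }
 * and synthAudio stores it in redis under the makeSynthKey() key as JSON, with the
 * buffer base64-encoded, e.g. (values illustrative only):
 *   {"audioContent":"SGVsbG8...","extension":"mp3","sampleRate":8000}
 */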
const synthPolly = async(createHash, retrieveHash, logger,
  {credentials, stats, language, voice, engine, key, text, renderForCaching, disableTtsStreaming, disableTtsCache}) => {
  const {region, accessKeyId, secretAccessKey, roleArn} = credentials;
  /* default to the streaming interface, unless disabled by env var OR we just want a cache file */
  if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
    let params = '{';
    params += `language=${language}`;
    params += `,playback_id=${key}`;
    params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
    params += ',vendor=aws';
    if (accessKeyId && secretAccessKey) {
      if (accessKeyId) params += `,accessKeyId=${accessKeyId}`;
      if (secretAccessKey) params += `,secretAccessKey=${secretAccessKey}`;
    } else if (roleArn) {
      const cred = await getAwsAuthToken(logger, createHash, retrieveHash, { region, roleArn });
      if (cred) {
        params += `,accessKeyId=${cred.accessKeyId}`;
        params += `,secretAccessKey=${cred.secretAccessKey}`;
        params += `,sessionToken=${cred.sessionToken}`;
      }
    }
    if (region) params += `,region=${region}`;
    if (engine) params += `,engine=${engine}`;
    params += '}';
    return {
      filePath: `say:${params}${text.replace(/\n/g, ' ').replace(/\r/g, ' ')}`,
      servedFromCache: false,
      rtt: 0
    };
  }
  try {
    let polly;
    if (accessKeyId && secretAccessKey) {
      polly = new PollyClient({ region, credentials: { accessKeyId, secretAccessKey } });
    } else if (roleArn) {
      polly = new PollyClient({
        region,
        credentials: await getAwsAuthToken(logger, createHash, retrieveHash, { region, roleArn }),
      });
    } else {
      // AWS RoleArn assigned to Instance profile
      polly = new PollyClient({region});
    }
    const opts = {
      Engine: engine,
      OutputFormat: 'mp3',
      Text: text,
      LanguageCode: language,
      TextType: text.startsWith('<speak') ? 'ssml' : 'text',
      VoiceId: voice
    };
    const command = new SynthesizeSpeechCommand(opts);
    const data = await polly.send(command);
    const chunks = [];
    return new Promise((resolve, reject) => {
      data.AudioStream
        .on('error', (err) => {
          logger.info({err}, 'synthAudio: Error synthesizing speech using aws polly');
          stats.increment('tts.count', ['vendor:aws', 'accepted:no']);
          reject(err);
        })
        .on('data', (chunk) => {
          chunks.push(chunk);
        })
        .on('end', () => resolve({
          audioContent: Buffer.concat(chunks),
          extension: 'mp3',
          sampleRate: 8000
        }));
    });
  } catch (err) {
    logger.info({err}, 'synthAudio: Error synthesizing speech using aws polly');
    stats.increment('tts.count', ['vendor:aws', 'accepted:no']);
    throw err;
  }
};
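/*
 * A note on the streaming return shape (illustrative, not produced verbatim by any one
 * vendor): when streaming is enabled, the helpers return a `say:` pseudo-path instead
 * of audio, for example
 *   say:{language=en-US,playback_id=abc123,write_cache_file=1,vendor=aws,region=us-east-1}Hello world
 * The freeswitch tts module parses the {key=value,...} block (commas separate the
 * parameters, hence the escaping of newlines and commas elsewhere in this file) and
 * performs the synthesis itself.
 */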
const synthGoogle = async(logger, {credentials, stats, language, voice, gender, text}) => {
  const client = new ttsGoogle.TextToSpeechClient(credentials);
  // If google custom voice cloning is used.
  // As of 31 Oct 2024, the google node sdk does not yet support voice cloning.
  if (typeof voice === 'object' && voice.voice_cloning_key) {
    try {
      const accessToken = await client.auth.getAccessToken();
      const projectId = await client.getProjectId();
      const post = bent('https://texttospeech.googleapis.com', 'POST', 'json', {
        'Authorization': `Bearer ${accessToken}`,
        'x-goog-user-project': projectId,
        'Content-Type': 'application/json; charset=utf-8'
      });
      const payload = {
        input: { text },
        voice: {
          language_code: language,
          voice_clone: { voice_cloning_key: voice.voice_cloning_key }
        },
        audioConfig: {
          // voice cloning is still in v1beta; it supports LINEAR16 (wav) at 24000 Hz
          audioEncoding: 'LINEAR16',
          sample_rate_hertz: 24000
        }
      };
      const wav = await post('/v1beta1/text:synthesize', payload);
      return {
        audioContent: Buffer.from(wav.audioContent, 'base64'),
        extension: 'wav',
        sampleRate: 24000
      };
    } catch (err) {
      logger.info({err: await err.text()}, 'synthGoogle returned error');
      throw err;
    }
  }
  const opts = {
    voice: {
      ...(typeof voice === 'string' && {name: voice}),
      ...(typeof voice === 'object' && {customVoice: voice}),
      languageCode: language,
      ssmlGender: gender || 'SSML_VOICE_GENDER_UNSPECIFIED'
    },
    audioConfig: {audioEncoding: 'MP3'}
  };
  Object.assign(opts, {input: text.startsWith('<speak') ? {ssml: text} : {text}});
  try {
    const responses = await client.synthesizeSpeech(opts);
    stats.increment('tts.count', ['vendor:google', 'accepted:yes']);
    client.close();
    return {
      audioContent: responses[0].audioContent,
      extension: 'mp3',
      sampleRate: 8000
    };
  } catch (err) {
    logger.info({err, opts}, 'synthAudio: Error synthesizing speech using google');
    stats.increment('tts.count', ['vendor:google', 'accepted:no']);
    client && client.close();
    throw err;
  }
};
const synthIbm = async(logger, {credentials, stats, voice, text}) => {
  const {tts_api_key, tts_region} = credentials;
  const params = {
    text,
    voice,
    accept: 'audio/mp3'
  };
  try {
    const textToSpeech = new TextToSpeechV1({
      authenticator: new IamAuthenticator({
        apikey: tts_api_key,
      }),
      serviceUrl: `https://api.${tts_region}.text-to-speech.watson.cloud.ibm.com`
    });
    const r = await textToSpeech.synthesize(params);
    const chunks = [];
    for await (const chunk of r.result) {
      chunks.push(chunk);
    }
    return {
      audioContent: Buffer.concat(chunks),
      extension: 'mp3',
      sampleRate: 8000
    };
  } catch (err) {
    logger.info({err, params}, 'synthAudio: Error synthesizing speech using ibm');
    stats.increment('tts.count', ['vendor:ibm', 'accepted:no']);
    throw new Error(err.statusText || err.message);
  }
};

async function _synthOnPremMicrosoft(logger, { credentials, language, voice, text }) {
  const {use_custom_tts, custom_tts_endpoint_url, api_key} = credentials;
  let content = text;
  if (use_custom_tts && !content.startsWith('<speak')) {
    content = `<speak>${text}</speak>`;
  }
  if (content.startsWith('<speak>')) {
    /* microsoft enforces some properties and uses voice xml element
       so if the user did not supply do it for them */
    const words = content.slice(7, -8).trim().replace(/(\r\n|\n|\r)/gm, ' ');
    // eslint-disable-next-line max-len
    content = `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="${language}"><voice name="${voice}">${words}</voice></speak>`;
    logger.info({content}, 'synthMicrosoft');
  } else if (JAMBONES_AZURE_ENABLE_SSML && !content.startsWith('<speak')) {
    // eslint-disable-next-line max-len
    content = `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="${language}"><voice name="${voice}">${text}</voice></speak>`;
  }
  try {
    const trimSilence = JAMBONES_TTS_TRIM_SILENCE;
    const post = bent('POST', 'buffer', {
      'X-Microsoft-OutputFormat': trimSilence ? 'raw-8khz-16bit-mono-pcm' : 'audio-16khz-32kbitrate-mono-mp3',
      'Content-Type': 'application/ssml+xml',
      'User-Agent': 'Jambonz',
      ...(api_key && {'Ocp-Apim-Subscription-Key': api_key})
    });
    const audioContent = await post(custom_tts_endpoint_url, content);
    return {
      audioContent,
      extension: trimSilence ? 'r8' : 'mp3',
      sampleRate: 8000
    };
  } catch (err) {
    logger.info({err}, '_synthOnPremMicrosoft returned error');
    throw err;
  }
}
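/*
 * SSML wrapping example (values hypothetical): given language 'en-US', voice
 * 'en-US-JennyNeural' and content '<speak>Hello world</speak>', the wrapping logic in
 * _synthOnPremMicrosoft above and synthMicrosoft below produces approximately:
 *   <speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">
 *     <voice name="en-US-JennyNeural">Hello world</voice>
 *   </speak>
 * content.slice(7, -8) strips the bare <speak> and </speak> tags (7 and 8 characters
 * respectively) before re-wrapping with the attributes Azure expects.
 */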
const synthMicrosoft = async(logger, {
  credentials, stats, language, voice, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
}) => {
  try {
    const {api_key: apiKey, region, use_custom_tts, custom_tts_endpoint, custom_tts_endpoint_url} = credentials;
    // clean up the text
    let content = text;
    if (use_custom_tts && !content.startsWith('<speak')) {
      content = `<speak>${text}</speak>`;
    }
    if (content.startsWith('<speak>')) {
      /* microsoft enforces some properties and uses voice xml element
         so if the user did not supply do it for them */
      const words = content.slice(7, -8).trim().replace(/(\r\n|\n|\r)/gm, ' ');
      // eslint-disable-next-line max-len
      content = `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="${language}"><voice name="${voice}">${words}</voice></speak>`;
      logger.info({content}, 'synthMicrosoft');
    } else if (JAMBONES_AZURE_ENABLE_SSML && !content.startsWith('<speak')) {
      // eslint-disable-next-line max-len
      content = `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="${language}"><voice name="${voice}">${text}</voice></speak>`;
    }

    if (!JAMBONES_DISABLE_TTS_STREAMING && !JAMBONES_DISABLE_AZURE_TTS_STREAMING &&
      !renderForCaching && !disableTtsStreaming) {
      let params = '{';
      params += `api_key=${apiKey}`;
      params += `,playback_id=${key}`;
      params += `,language=${language}`;
      params += ',vendor=microsoft';
      params += `,voice=${voice}`;
      params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
      if (region) params += `,region=${region}`;
      if (custom_tts_endpoint) params += `,endpointId=${custom_tts_endpoint}`;
      if (custom_tts_endpoint_url) params += `,endpoint=${custom_tts_endpoint_url}`;
      if (JAMBONES_HTTP_PROXY_IP) params += `,http_proxy_ip=${JAMBONES_HTTP_PROXY_IP}`;
      if (JAMBONES_HTTP_PROXY_PORT) params += `,http_proxy_port=${JAMBONES_HTTP_PROXY_PORT}`;
      params += '}';
      return {
        filePath: `say:${params}${content.replace(/\n/g, ' ')}`,
        servedFromCache: false,
        rtt: 0
      };
    }

    // Azure Onprem
    if (use_custom_tts && custom_tts_endpoint_url) {
      return await _synthOnPremMicrosoft(logger, { credentials, stats, language, voice, text });
    }

    // Azure hosted service
    const trimSilence = JAMBONES_TTS_TRIM_SILENCE;
    const speechConfig = SpeechConfig.fromSubscription(apiKey, region);
    speechConfig.speechSynthesisLanguage = language;
    speechConfig.speechSynthesisVoiceName = voice;
    if (use_custom_tts && custom_tts_endpoint) {
      speechConfig.endpointId = custom_tts_endpoint;
    }
    speechConfig.speechSynthesisOutputFormat = trimSilence ?
      SpeechSynthesisOutputFormat.Raw8Khz16BitMonoPcm :
      SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3;
    if (JAMBONES_HTTP_PROXY_IP && JAMBONES_HTTP_PROXY_PORT) {
      logger.debug(`synthMicrosoft: using proxy ${JAMBONES_HTTP_PROXY_IP}:${JAMBONES_HTTP_PROXY_PORT}`);
      speechConfig.setProxy(JAMBONES_HTTP_PROXY_IP, JAMBONES_HTTP_PROXY_PORT);
    }
    const synthesizer = new SpeechSynthesizer(speechConfig);
    return new Promise((resolve, reject) => {
      const speakAsync = content.startsWith('<speak') ?
        synthesizer.speakSsmlAsync.bind(synthesizer) :
        synthesizer.speakTextAsync.bind(synthesizer);
      speakAsync(
        content,
        (result) => {
          switch (result.reason) {
            case ResultReason.Canceled:
              const cancellation = CancellationDetails.fromResult(result);
              logger.info({reason: cancellation.errorDetails}, 'synthAudio: (Microsoft) synthesis canceled');
              synthesizer.close();
              reject(cancellation.errorDetails);
              break;
            case ResultReason.SynthesizingAudioCompleted:
              let buffer = Buffer.from(result.audioData);
              if (trimSilence) buffer = trimTrailingSilence(buffer);
              resolve({
                audioContent: buffer,
                extension: trimSilence ? 'r8' : 'mp3',
                sampleRate: 8000
              });
              synthesizer.close();
              stats.increment('tts.count', ['vendor:microsoft', 'accepted:yes']);
              break;
            default:
              logger.info({result}, 'synthAudio: (Microsoft) unexpected result');
              break;
          }
        },
        (err) => {
          logger.info({err}, 'synthAudio: (Microsoft) error synthesizing');
          stats.increment('tts.count', ['vendor:microsoft', 'accepted:no']);
          synthesizer.close();
          reject(err);
        });
    });
  } catch (err) {
    logger.info({err}, 'synthAudio: Error synthesizing speech using Microsoft');
    stats.increment('tts.count', ['vendor:microsoft', 'accepted:no']);
  }
};
const synthWellSaid = async(logger, {credentials, stats, language, voice, gender, text}) => {
  const {api_key} = credentials;
  try {
    const post = bent('https://api.wellsaidlabs.com', 'POST', 'buffer', {
      'X-Api-Key': api_key,
      'Accept': 'audio/mpeg',
      'Content-Type': 'application/json'
    });
    const audioContent = await post('/v1/tts/stream', {
      text,
      speaker_id: voice
    });
    return {
      audioContent,
      extension: 'mp3',
      sampleRate: 8000
    };
  } catch (err) {
    logger.info({err}, 'synthWellSaid returned error');
    throw err;
  }
};

const synthNuance = async(client, logger, {credentials, stats, voice, model, text}) => {
  let nuanceClient;
  const {client_id, secret, nuance_tts_uri} = credentials;
  if (nuance_tts_uri) {
    nuanceClient = await createKryptonClient(nuance_tts_uri);
  } else {
    /* get a nuance access token */
    const {access_token} = await getNuanceAccessToken(client, logger, client_id, secret, 'tts');
    nuanceClient = await createNuanceClient(access_token);
  }

  const v = new Voice();
  const p = new AudioParameters();
  const f = new AudioFormat();
  const pcm = new PCM();
  const params = new EventParameters();
  const request = new SynthesisRequest();
  const input = new Input();
  const sampleRate = 8000;

  if (text.startsWith('<speak')) {
    const ssml = new SSML();
    ssml.setText(text);
    input.setSsml(ssml);
  } else {
    const t = new Text();
    t.setText(text);
    input.setText(t);
  }
  v.setName(voice);
  v.setModel(model);
  pcm.setSampleRateHz(sampleRate);
  f.setPcm(pcm);
  p.setAudioFormat(f);
  request.setVoice(v);
  request.setAudioParams(p);
  request.setInput(input);
  request.setEventParams(params);

  return new Promise((resolve, reject) => {
    nuanceClient.unarySynthesize(request, (err, response) => {
      if (err) {
        logger.info({err}, 'synthNuance returned error');
        return reject(err);
      }
      const status = response.getStatus();
      const code = status.getCode();
      if (code !== 200) {
        const message = status.getMessage();
        const details = status.getDetails();
        return reject({code, message, details});
      }
      resolve({
        audioContent: Buffer.from(response.getAudio()),
        extension: 'r8',
        sampleRate
      });
    });
  });
};
const synthNvidia = async(client, logger, {
  credentials, stats, language, voice, model, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
}) => {
  const {riva_server_uri} = credentials;
  if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
    let params = '';
    params += `{riva_server_uri=${riva_server_uri}`;
    params += `,playback_id=${key}`;
    params += `,voice=${voice}`;
    params += `,language=${language}`;
    params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
    params += '}';
    return {
      filePath: `say:${params}${text.replace(/\n/g, ' ')}`,
      servedFromCache: false,
      rtt: 0
    };
  }
  let rivaClient, request;
  const sampleRate = 8000;
  try {
    rivaClient = await createRivaClient(riva_server_uri);
    request = new SynthesizeSpeechRequest();
    request.setVoiceName(voice);
    request.setLanguageCode(language);
    request.setSampleRateHz(sampleRate);
    request.setEncoding(AudioEncoding.LINEAR_PCM);
    request.setText(text);
  } catch (err) {
    logger.info({err}, 'error creating riva client');
    return Promise.reject(err);
  }
  return new Promise((resolve, reject) => {
    rivaClient.synthesize(request, (err, response) => {
      if (err) {
        logger.info({err, voice, language}, 'error synthesizing speech using Nvidia');
        return reject(err);
      }
      resolve({
        audioContent: Buffer.from(response.getAudio()),
        extension: 'r8',
        sampleRate
      });
    });
  });
};

const synthCustomVendor = async(logger, {
  credentials, stats, language, voice, text, filePath, renderForCaching, disableTtsStreaming, key, disableTtsCache
}) => {
  const {vendor, auth_token, custom_tts_url} = credentials;
  if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
    let params = '{';
    params += `auth_token=${auth_token}`;
    params += `,playback_id=${key}`;
    params += `,custom_tts_url=${custom_tts_url}`;
    params += ',vendor=custom';
    params += `,voice=${voice}`;
    params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
    params += '}';
    return {
      filePath: `say:${params}${text.replace(/\n/g, ' ').replace(/\r/g, ' ')}`,
      servedFromCache: false,
      rtt: 0
    };
  }
  try {
    const post = bent('POST', {
      'Authorization': `Bearer ${auth_token}`,
      'Content-Type': 'application/json'
    });
    const response = await post(custom_tts_url, {
      language,
      voice,
      type: text.startsWith('<speak') ? 'ssml' : 'text',
      text
    });
    const mime = response.headers['content-type'];
    const buffer = await response.arrayBuffer();
    const [extension, sampleRate] = getFileExtFromMime(mime);
    return {
      audioContent: buffer,
      extension,
      sampleRate
    };
  } catch (err) {
    logger.info({err}, `Vendor ${vendor} returned error`);
    throw err;
  }
};
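/*
 * Sketch of the HTTP contract a custom vendor endpoint must implement, inferred from
 * synthCustomVendor above (the URL and token shown are hypothetical):
 *   POST https://tts.example.com/synthesize
 *   Authorization: Bearer <auth_token>
 *   Content-Type: application/json
 *
 *   {"language":"en-US","voice":"my-voice","type":"text","text":"Hello world"}
 * The response body is raw audio; its Content-Type header (audio/wav, audio/mpeg, or
 * audio/l16;rate=NNNN) is mapped to a file extension and sample rate by
 * getFileExtFromMime at the bottom of this file.
 */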
const synthElevenlabs = async(logger, {
  credentials, options, stats, voice, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
}) => {
  const {api_key, model_id, api_uri, options: credOpts} = credentials;
  const opts = !!options && Object.keys(options).length !== 0 ? options : JSON.parse(credOpts || '{}');
  /* default to using the streaming interface, unless disabled by env var OR we want just a cache file */
  if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
    let params = '{';
    params += `api_key=${api_key}`;
    params += `,playback_id=${key}`;
    params += ',vendor=elevenlabs';
    params += `,voice=${voice}`;
    params += `,model_id=${model_id}`;
    params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
    if (api_uri) params += `,api_uri=${api_uri}`;
    if (opts.optimize_streaming_latency !== null && opts.optimize_streaming_latency !== undefined) {
      params += `,optimize_streaming_latency=${opts.optimize_streaming_latency}`;
    }
    if (opts.voice_settings?.similarity_boost) params += `,similarity_boost=${opts.voice_settings.similarity_boost}`;
    if (opts.voice_settings?.stability) params += `,stability=${opts.voice_settings.stability}`;
    if (opts.voice_settings?.style) params += `,style=${opts.voice_settings.style}`;
    if (opts.voice_settings?.speed !== null && opts.voice_settings?.speed !== undefined) {
      params += `,speed=${opts.voice_settings.speed}`;
    }
    if (opts.voice_settings?.use_speaker_boost === false) params += ',use_speaker_boost=false';
    if (opts.previous_text) params += `,previous_text=${opts.previous_text}`;
    if (opts.next_text) params += `,next_text=${opts.next_text}`;
    if (opts.pronunciation_dictionary_locators && Array.isArray(opts.pronunciation_dictionary_locators)) {
      params += `,pronunciation_dictionary_locators=${JSON.stringify(opts.pronunciation_dictionary_locators)}`;
    }
    params += '}';
    return {
      filePath: `say:${params}${text.replace(/\n/g, ' ').replace(/\r/g, ' ')}`,
      servedFromCache: false,
      rtt: 0
    };
  }
  const optimize_streaming_latency = opts.optimize_streaming_latency ?
    `?optimize_streaming_latency=${opts.optimize_streaming_latency}` : '';
  try {
    const post = bent(`https://${api_uri}`, 'POST', 'buffer', {
      'xi-api-key': api_key,
      'Accept': 'audio/mpeg',
      'Content-Type': 'application/json'
    });
    const audioContent = await post(`/v1/text-to-speech/${voice}${optimize_streaming_latency}`, {
      text,
      model_id,
      voice_settings: {
        stability: 0.5,
        similarity_boost: 0.5
      },
      ...opts
    });
    return {
      audioContent,
      extension: 'mp3',
      sampleRate: 8000
    };
  } catch (err) {
    logger.info({err}, 'synth Elevenlabs returned error');
    stats.increment('tts.count', ['vendor:elevenlabs', 'accepted:no']);
    throw err;
  }
};
const synthPlayHT = async(client, logger, {
  credentials, options, stats, voice, language, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
}) => {
  const {api_key, user_id, voice_engine, playht_tts_uri, options: credOpts} = credentials;
  const opts = !!options && Object.keys(options).length !== 0 ? options : JSON.parse(credOpts || '{}');
  let synthesizeUrl = playht_tts_uri ?
    `${playht_tts_uri}/api/v2/tts/stream` :
    'https://api.play.ht/api/v2/tts/stream';
  // For Play3.0, the synthesize URL is obtained from the authentication endpoint
  if (voice_engine === 'Play3.0') {
    try {
      const post = bent('https://api.play.ht', 'POST', 'json', 201, {
        'AUTHORIZATION': api_key,
        'X-USER-ID': user_id,
        'Accept': 'application/json'
      });
      const playhtKey = makePlayhtKey(api_key);
      const url = await client.get(playhtKey);
      if (!url) {
        const {inference_address, expires_at_ms} = await post('/api/v3/auth');
        synthesizeUrl = inference_address;
        const expiry = Math.floor((expires_at_ms - Date.now()) / 1000 - 30);
        await client.set(playhtKey, inference_address, 'EX', expiry);
      } else {
        // Use cached URL
        synthesizeUrl = url;
      }
    } catch (err) {
      logger.info({err}, 'synth PlayHT returned error for authentication version 3.0');
      stats.increment('tts.count', ['vendor:playht', 'accepted:no']);
      throw err;
    }
  }
  /* default to using the streaming interface, unless disabled by env var OR we want just a cache file */
  if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
    let params = '{';
    params += `api_key=${api_key}`;
    params += `,playback_id=${key}`;
    params += `,user_id=${user_id}`;
    params += ',vendor=playht';
    params += `,voice=${voice}`;
    params += `,voice_engine=${voice_engine}`;
    params += `,synthesize_url=${synthesizeUrl}`;
    params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
    params += `,language=${language}`;
    if (opts.quality) params += `,quality=${opts.quality}`;
    if (opts.speed) params += `,speed=${opts.speed}`;
    if (opts.seed) params += `,seed=${opts.seed}`;
    if (opts.temperature) params += `,temperature=${opts.temperature}`;
    if (opts.emotion) params += `,emotion=${opts.emotion}`;
    if (opts.voice_guidance) params += `,voice_guidance=${opts.voice_guidance}`;
    if (opts.style_guidance) params += `,style_guidance=${opts.style_guidance}`;
    if (opts.text_guidance) params += `,text_guidance=${opts.text_guidance}`;
    if (opts.top_p) params += `,top_p=${opts.top_p}`;
    if (opts.repetition_penalty) params += `,repetition_penalty=${opts.repetition_penalty}`;
    params += '}';
    return {
      filePath: `say:${params}${text.replace(/\n/g, ' ').replace(/\r/g, ' ')}`,
      servedFromCache: false,
      rtt: 0
    };
  }
  try {
    const post = bent('POST', 'buffer', {
      ...(voice_engine !== 'Play3.0' && {
        'AUTHORIZATION': api_key,
        'X-USER-ID': user_id,
      }),
      'Accept': 'audio/mpeg',
      'Content-Type': 'application/json'
    });
    const audioContent = await post(synthesizeUrl, {
      text,
      ...(voice_engine === 'Play3.0' && { language }),
      voice,
      voice_engine,
      output_format: 'mp3',
      sample_rate: 8000,
      ...opts
    });
    return {
      audioContent,
      extension: 'mp3',
      sampleRate: 8000
    };
  } catch (err) {
    logger.info({err}, 'synth PlayHT returned error');
    stats.increment('tts.count', ['vendor:playht', 'accepted:no']);
    throw err;
  }
};
const synthInworld = async(logger, {
  credentials, options, stats, voice, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
}) => {
  const {api_key, model_id, options: credOpts} = credentials;
  const opts = !!options && Object.keys(options).length !== 0 ? options : JSON.parse(credOpts || '{}');
  /* default to using the streaming interface, unless disabled by env var OR we want just a cache file */
  if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
    let params = '{';
    params += `api_key=${api_key}`;
    params += `,playback_id=${key}`;
    params += `,model_id=${model_id}`;
    params += ',vendor=inworld';
    params += `,voice=${voice}`;
    params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
    if (opts.temperature) params += `,temperature=${opts.temperature}`;
    if (opts.audioConfig?.pitch) params += `,pitch=${opts.audioConfig.pitch}`;
    if (opts.audioConfig?.speakingRate) params += `,speakingRate=${opts.audioConfig.speakingRate}`;
    params += '}';
    return {
      filePath: `say:${params}${text.replace(/\n/g, ' ').replace(/\r/g, ' ')}`,
      servedFromCache: false,
      rtt: 0
    };
  }
  try {
    const url = 'https://api.inworld.ai/tts/v1/voice';
    const sampleRate = 8000;
    const fetchOptions = {
      method: 'POST',
      headers: {
        'Authorization': `Basic ${api_key}`,
        'Content-Type': 'application/json'
      },
      body: JSON.stringify({
        text,
        voiceId: voice,
        modelId: model_id,
        audioConfig: {
          ...(opts.audioConfig || {}),
          audioEncoding: 'MP3',
        }
      })
    };
    const response = await fetch(url, fetchOptions);
    if (!response.ok) {
      throw new Error(await response.text());
    }
    const json = await response.json();
    return {
      audioContent: Buffer.from(json.audioContent, 'base64'),
      extension: 'mp3',
      sampleRate
    };
  } catch (err) {
    logger.info({err}, 'synth inworld returned error');
    stats.increment('tts.count', ['vendor:inworld', 'accepted:no']);
    throw err;
  }
};
const synthRimelabs = async(logger, {
  credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
}) => {
  const {api_key, model_id, options: credOpts} = credentials;
  const opts = !!options && Object.keys(options).length !== 0 ? options : JSON.parse(credOpts || '{}');
  /* default to using the streaming interface, unless disabled by env var OR we want just a cache file */
  if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
    let params = '{';
    params += `api_key=${api_key}`;
    params += `,playback_id=${key}`;
    params += `,model_id=${model_id}`;
    params += ',vendor=rimelabs';
    params += `,language=${language}`;
    params += `,voice=${voice}`;
    params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
    if (opts.speedAlpha) params += `,speed_alpha=${opts.speedAlpha}`;
    if (opts.reduceLatency) params += `,reduce_latency=${opts.reduceLatency}`;
    // Arcana model parameters
    if (opts.temperature) params += `,temperature=${opts.temperature}`;
    if (opts.repetition_penalty) params += `,repetition_penalty=${opts.repetition_penalty}`;
    if (opts.top_p) params += `,top_p=${opts.top_p}`;
    if (opts.max_tokens) params += `,max_tokens=${opts.max_tokens}`;
    params += '}';
    return {
      filePath: `say:${params}${text.replace(/\n/g, ' ').replace(/\r/g, ' ')}`,
      servedFromCache: false,
      rtt: 0
    };
  }
  try {
    const post = bent('https://users.rime.ai', 'POST', 'buffer', {
      'Authorization': `Bearer ${api_key}`,
      'Accept': 'audio/mp3',
      'Content-Type': 'application/json'
    });
    const sampleRate = 8000;
    const audioContent = await post('/v1/rime-tts', {
      speaker: voice,
      text,
      modelId: model_id,
      samplingRate: sampleRate,
      lang: language,
      ...opts
    });
    return {
      audioContent,
      extension: 'mp3',
      sampleRate
    };
  } catch (err) {
    logger.info({err}, 'synth rimelabs returned error');
    stats.increment('tts.count', ['vendor:rimelabs', 'accepted:no']);
    throw err;
  }
};

const synthVerbio = async(client, logger, {
  credentials, stats, voice, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
}) => {
  // https://doc.speechcenter.verbio.com/#tag/Text-To-Speech-REST-API
  if (text.length > 2000) {
    throw new Error('Verbio cannot synthesize text longer than 2000 characters');
  }
  const token = await getVerbioAccessToken(client, logger, credentials);
  if (!process.env.JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
    let params = '{';
    params += `access_token=${token.access_token}`;
    params += `,playback_id=${key}`;
    params += ',vendor=verbio';
    params += `,voice=${voice}`;
    params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
    params += '}';
    return {
      filePath: `say:${params}${text.replace(/\n/g, ' ')}`,
      servedFromCache: false,
      rtt: 0
    };
  }
  try {
    const post = bent('https://us.rest.speechcenter.verbio.com', 'POST', 'buffer', {
      'Authorization': `Bearer ${token.access_token}`,
      'User-Agent': 'jambonz',
      'Content-Type': 'application/json'
    });
    const audioContent = await post('/api/v1/synthesize', {
      voice_id: voice,
      output_sample_rate: '8k',
      output_encoding: 'pcm16',
      text
    });
    return {
      audioContent,
      extension: 'r8',
      sampleRate: 8000
    };
  } catch (err) {
    logger.info({err}, 'synth Verbio returned error');
    stats.increment('tts.count', ['vendor:verbio', 'accepted:no']);
    throw err;
  }
};
const synthWhisper = async(logger, {
  credentials, stats, voice, key, text, instructions, renderForCaching, disableTtsStreaming, disableTtsCache
}) => {
  const {api_key, model_id, baseURL, timeout, speed} = credentials;
  /* if the env is set to stream then bag out, unless we are specifically rendering to generate a cache file */
  if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
    let params = '{';
    params += `api_key=${api_key}`;
    params += `,playback_id=${key}`;
    params += `,model_id=${model_id}`;
    params += ',vendor=whisper';
    params += `,voice=${voice}`;
    params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
    if (speed) params += `,speed=${speed}`;
    // comma is used to separate parameters in the freeswitch tts module
    if (instructions) params += `,instructions=${instructions.replace(/\n/g, ' ').replace(/,/g, ';')}`;
    params += '}';
    return {
      filePath: `say:${params}${text.replace(/\n/g, ' ')}`,
      servedFromCache: false,
      rtt: 0
    };
  }
  try {
    const openai = new OpenAI.OpenAI({
      apiKey: api_key,
      timeout: timeout || 5000,
      ...(baseURL && {baseURL})
    });
    const mp3 = await openai.audio.speech.create({
      model: model_id,
      voice,
      input: text,
      ...(instructions && {instructions}),
      response_format: 'mp3'
    });
    return {
      audioContent: Buffer.from(await mp3.arrayBuffer()),
      extension: 'mp3',
      sampleRate: 8000
    };
  } catch (err) {
    logger.info({err}, 'synth whisper returned error');
    stats.increment('tts.count', ['vendor:openai', 'accepted:no']);
    throw err;
  }
};

const synthDeepgram = async(logger, {
  credentials, stats, model, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
}) => {
  const {api_key, deepgram_tts_uri} = credentials;
  if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
    let params = '{';
    params += `api_key=${api_key}`;
    params += `,playback_id=${key}`;
    params += ',vendor=deepgram';
    params += `,voice=${model}`;
    params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
    if (deepgram_tts_uri) params += `,endpoint=${deepgram_tts_uri}`;
    params += '}';
    return {
      filePath: `say:${params}${text.replace(/\n/g, ' ')}`,
      servedFromCache: false,
      rtt: 0
    };
  }
  try {
    const post = bent(deepgram_tts_uri || 'https://api.deepgram.com', 'POST', 'buffer', {
      // on-prem deepgram does not require an api_key
      ...(api_key && {'Authorization': `Token ${api_key}`}),
      'Accept': 'audio/mpeg',
      'Content-Type': 'application/json'
    });
    const audioContent = await post(`/v1/speak?model=${model}`, { text });
    return {
      audioContent,
      extension: 'mp3',
      sampleRate: 8000
    };
  } catch (err) {
    logger.info({err}, 'synth Deepgram returned error');
    stats.increment('tts.count', ['vendor:deepgram', 'accepted:no']);
    throw err;
  }
};
const synthCartesia = async(logger, {
  credentials, options, stats, voice, language, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
}) => {
  const {api_key, model_id, embedding, options: credOpts} = credentials;
  const opts = !!options && Object.keys(options).length !== 0 ? options : JSON.parse(credOpts || '{}');
  if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
    let params = '{';
    params += `api_key=${api_key}`;
    params += `,playback_id=${key}`;
    params += `,model_id=${model_id}`;
    params += ',vendor=cartesia';
    params += `,voice=${voice}`;
    params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
    params += `,language=${language}`;
    params += `,voice_mode=${embedding ? 'embedding' : 'id'}`;
    if (embedding) params += `,embedding=${embedding}`;
    if (opts.speed) params += `,speed=${opts.speed}`;
    if (opts.emotion) params += `,emotion=${opts.emotion}`;
    if (opts.volume) params += `,volume=${opts.volume}`;
    params += '}';
    return {
      filePath: `say:${params}${text.replace(/\n/g, ' ').replace(/\r/g, ' ')}`,
      servedFromCache: false,
      rtt: 0
    };
  }
  try {
    const client = new CartesiaClient({ apiKey: api_key });
    const sampleRate = 48000;
    const mp3Stream = await client.tts.bytes({
      modelId: model_id,
      transcript: text,
      voice: {
        mode: embedding ? 'embedding' : 'id',
        ...(embedding ? { embedding: embedding.split(',').map(Number) } : { id: voice }),
        ...(model_id === 'sonic-2' && (opts.speed || opts.emotion) && {
          experimentalControls: {
            ...(opts.speed !== null && opts.speed !== undefined && {speed: opts.speed}),
            ...(opts.emotion && {emotion: [opts.emotion]}),
          }
        }),
      },
      ...(model_id === 'sonic-3' && (opts.speed || opts.emotion || opts.volume) && {
        generationConfig: {
          ...(opts.volume !== null && opts.volume !== undefined && {volume: opts.volume}),
          ...(opts.speed !== null && opts.speed !== undefined && {speed: opts.speed}),
          ...(opts.emotion !== null && opts.emotion !== undefined && {emotion: opts.emotion}),
        }
      }),
      language: language,
      outputFormat: {
        container: 'mp3',
        bitRate: 128000,
        sampleRate
      },
    });
    // bytes() returns a ReadableStream - collect all chunks
    const chunks = [];
    for await (const chunk of mp3Stream) {
      chunks.push(chunk);
    }
    const audioBuffer = Buffer.concat(chunks);
    return {
      audioContent: audioBuffer,
      extension: 'mp3',
      sampleRate
    };
  } catch (err) {
    logger.info({err}, 'synth Cartesia returned error');
    stats.increment('tts.count', ['vendor:cartesia', 'accepted:no']);
    throw err;
  }
};

const synthResemble = async(logger, {
  credentials, options, stats, voice, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
}) => {
  const {api_key, resemble_tts_uri, resemble_tts_use_tls} = credentials;
  const {project_uuid, use_hd} = options || {};
  /* default to using the streaming interface, unless disabled by env var OR we want just a cache file */
  if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
    let params = '{';
    params += `api_key=${api_key}`;
    params += `,playback_id=${key}`;
    params += ',vendor=resemble';
    params += `,voice=${voice}`;
    params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
    if (project_uuid) params += `,project_uuid=${project_uuid}`;
    if (use_hd) params += `,use_hd=${use_hd}`;
    if (resemble_tts_uri) params += `,endpoint=${resemble_tts_uri}`;
    if (resemble_tts_use_tls) params += `,use_tls=${resemble_tts_use_tls}`;
    params += '}';
    return {
      filePath: `say:${params}${text.replace(/\n/g, ' ').replace(/\r/g, ' ')}`,
      servedFromCache: false,
      rtt: 0
    };
  }
  try {
    const baseUrl = resemble_tts_uri || 'https://f.cluster.resemble.ai';
    const post = bent(baseUrl, 'POST', 'buffer', {
      'Authorization': `Bearer ${api_key}`,
      'Content-Type': 'application/json'
    });
    const response = await post('/synthesize', {
      voice_uuid: voice,
      data: text,
      sample_rate: 8000,
      output_format: 'mp3',
      ...(project_uuid && {project_uuid}),
      ...(use_hd && {use_hd}),
    });
    const json = JSON.parse(response.toString('utf8'));
    const audioContent = Buffer.from(json.audio_content, 'base64');
    return {
      audioContent,
      extension: 'mp3',
      sampleRate: 8000
    };
  } catch (err) {
    logger.info({err}, 'synth Resemble returned error');
    stats.increment('tts.count', ['vendor:resemble', 'accepted:no']);
    throw err;
  }
};
const getFileExtFromMime = (mime) => {
  /* the regex cases use the same trick as the custom-vendor case in synthAudio:
     each case expression evaluates to the tested value only when the regex matches */
  switch (mime) {
    case 'audio/wav':
    case 'audio/x-wav':
      return ['wav', 8000];
    case /audio\/l16.*rate=8000/.test(mime) ? mime : 'cant match value':
      return ['r8', 8000];
    case /audio\/l16.*rate=16000/.test(mime) ? mime : 'cant match value':
      return ['r16', 16000];
    case /audio\/l16.*rate=24000/.test(mime) ? mime : 'cant match value':
      return ['r24', 24000];
    case /audio\/l16.*rate=32000/.test(mime) ? mime : 'cant match value':
      return ['r32', 32000];
    case /audio\/l16.*rate=48000/.test(mime) ? mime : 'cant match value':
      return ['r48', 48000];
    case 'audio/mpeg':
    case 'audio/mp3':
      return ['mp3', 8000];
    default:
      return ['wav', 8000];
  }
};

module.exports = synthAudio;
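/*
 * Usage sketch (assumes a connected redis client plus pino-style logger and
 * statsd-style stats objects; the module path and argument values are hypothetical):
 *
 *   const synthAudio = require('./synth-audio');
 *   const {filePath, servedFromCache, rtt} = await synthAudio(
 *     client, createHash, retrieveHash, logger, stats, {
 *       account_sid: 'AC...',
 *       vendor: 'google',
 *       language: 'en-US',
 *       voice: 'en-US-Standard-C',
 *       text: 'Hello world'
 *     });
 *   // per the JSDoc above, the caller must unlink filePath after use
 *   fs.unlink(filePath, () => {});
 */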