const assert = require('assert');
const fs = require('fs');
const bent = require('bent');
const ttsGoogle = require('@google-cloud/text-to-speech');
const { PollyClient, SynthesizeSpeechCommand } = require('@aws-sdk/client-polly');
const sdk = require('microsoft-cognitiveservices-speech-sdk');
const TextToSpeechV1 = require('ibm-watson/text-to-speech/v1');
const { IamAuthenticator } = require('ibm-watson/auth');
const {
  ResultReason,
  SpeechConfig,
  SpeechSynthesizer,
  CancellationDetails,
  SpeechSynthesisOutputFormat
} = sdk;
const {makeSynthKey, createNuanceClient, createKryptonClient, createRivaClient, noopLogger} = require('./utils');
const getNuanceAccessToken = require('./get-nuance-access-token');
const {
  SynthesisRequest,
  Voice,
  AudioFormat,
  AudioParameters,
  PCM,
  Input,
  Text,
  SSML,
  EventParameters
} = require('../stubs/nuance/synthesizer_pb');
const {SynthesizeSpeechRequest} = require('../stubs/riva/proto/riva_tts_pb');
const {AudioEncoding} = require('../stubs/riva/proto/riva_audio_pb');
const debug = require('debug')('jambonz:realtimedb-helpers');

// cache tts for 24 hours by default; override via JAMBONES_TTS_CACHE_DURATION_MINS
const EXPIRES = (process.env.JAMBONES_TTS_CACHE_DURATION_MINS || 24 * 60) * 60;
const TMP_FOLDER = '/tmp';

/**
 * Synthesize speech to an mp3 file, and also cache the generated speech
 * in redis (base64 format) for 24 hours so as to avoid unnecessarily paying
 * time and again for speech synthesis of the same text.
 * It is the responsibility of the caller to unlink the mp3 file after use.
 *
 * @param {*} client - redis client
 * @param {*} logger - pino logger
 * @param {object} opts - options
 * @param {string} opts.vendor - 'google', 'aws' ('polly' is an alias for 'aws'), 'microsoft',
 * 'ibm', 'nuance', 'nvidia', 'wellsaid', or a vendor name starting with 'custom'
 * @param {string} opts.language - language code
 * @param {string} opts.voice - voice identifier
 * @param {string} opts.gender - (google) preferred voice gender
 * @param {string} opts.engine - (aws) speech engine, e.g. 'standard' or 'neural'
 * @param {string} opts.model - (nuance, nvidia) model to use
 * @param {object} opts.credentials - vendor-specific credentials
 * @param {string} opts.deploymentId - (microsoft) custom voice deployment id
 * @param {string} opts.text - text or ssml to synthesize
 * @param {boolean} opts.disableTtsCache - disable TTS cache retrieval
 * @returns object containing filepath to an mp3 file in the /tmp folder containing
 * the synthesized audio, and a variable indicating whether it was served from cache
 */
async function synthAudio(client, logger, stats, {
  account_sid, vendor, language, voice, gender, text, engine,
  salt, model, credentials, deploymentId, disableTtsCache
}) {
  let audioBuffer;
  let servedFromCache = false;
  let rtt;
  logger = logger || noopLogger;

  assert.ok(
    ['google', 'aws', 'polly', 'microsoft', 'wellsaid', 'nuance', 'nvidia', 'ibm'].includes(vendor) ||
    vendor.startsWith('custom'),
    `synthAudio supported vendors are google, aws, microsoft, wellsaid, nuance, nvidia and ibm, not ${vendor}`);

  if ('google' === vendor) {
    assert.ok(language, 'synthAudio requires language when google is used');
  }
  else if (['aws', 'polly'].includes(vendor)) {
    assert.ok(voice, 'synthAudio requires voice when aws polly is used');
  }
  else if ('microsoft' === vendor) {
    assert.ok(language || deploymentId, 'synthAudio requires language when microsoft is used');
    assert.ok(voice || deploymentId, 'synthAudio requires voice when microsoft is used');
  }
  else if ('nuance' === vendor) {
    assert.ok(voice, 'synthAudio requires voice when nuance is used');
    if (!credentials.nuance_tts_uri) {
      assert.ok(credentials.client_id, 'synthAudio requires client_id in credentials when nuance is used');
      assert.ok(credentials.secret, 'synthAudio requires secret in credentials when nuance is used');
    }
  }
  else if ('nvidia' === vendor) {
    assert.ok(voice, 'synthAudio requires voice when nvidia is used');
    assert.ok(language, 'synthAudio requires language when nvidia is used');
    assert.ok(credentials.riva_server_uri, 'synthAudio requires riva_server_uri in credentials when nvidia is used');
  }
  else if ('ibm' === vendor) {
    assert.ok(voice, 'synthAudio requires voice when ibm is used');
    assert.ok(credentials.tts_region, 'synthAudio requires tts_region in credentials when ibm watson is used');
    assert.ok(credentials.tts_api_key, 'synthAudio requires tts_api_key in credentials when ibm watson is used');
  }
  else if ('wellsaid' === vendor) {
    language = 'en-US'; // WellSaid only supports English atm
    assert.ok(voice, 'synthAudio requires voice when wellsaid is used');
    assert.ok(!text.startsWith('<speak'), 'synthAudio: wellsaid does not support SSML content');
  }

  const key = makeSynthKey({account_sid, vendor, language: language || '', voice: voice || deploymentId, engine, salt, text});
  debug(`synth key is ${key}`);
  let filePath = `${TMP_FOLDER}/${key.replace('tts:', 'tts-')}.mp3`;
  let cached;
  if (!disableTtsCache) {
    cached = await client.get(key);
  }
  if (cached) {
    // found in cache - extend the expiry and use it
    debug('result WAS found in cache');
    servedFromCache = true;
    stats.increment('tts.cache.requests', ['found:yes']);
    audioBuffer = Buffer.from(cached, 'base64');
    client.expire(key, EXPIRES)
      .catch((err) => logger.info(err, 'Error setting expires'));
  }
  if (!cached) {
    // not found in cache - go get it from speech vendor and add to cache
    debug('result was NOT found in cache');
    stats.increment('tts.cache.requests', ['found:no']);
    let vendorLabel = vendor;
    const startAt = process.hrtime();
    switch (vendor) {
      case 'google':
        audioBuffer = await synthGoogle(logger, {credentials, stats, language, voice, gender, text});
        break;
      case 'aws':
      case 'polly':
        vendorLabel = 'aws';
        audioBuffer = await synthPolly(logger, {credentials, stats, language, voice, text, engine});
        break;
      case 'azure':
      case 'microsoft':
        vendorLabel = 'microsoft';
        audioBuffer = await synthMicrosoft(logger, {credentials, stats, language, voice, text, deploymentId, filePath});
        break;
      case 'nuance':
        model = model || 'enhanced';
        audioBuffer = await synthNuance(client, logger, {credentials, stats, voice, model, text});
        break;
      case 'nvidia':
        audioBuffer = await synthNvidia(client, logger, {credentials, stats, language, voice, model, text});
        break;
      case 'ibm':
        audioBuffer = await synthIbm(logger, {credentials, stats, voice, text});
        break;
      case 'wellsaid':
        audioBuffer = await synthWellSaid(logger, {credentials, stats, language, voice, text, filePath});
        break;
      /* matches any vendor name beginning with 'custom' */
      case vendor.startsWith('custom') ? vendor : 'cant_match_value':
        ({audioBuffer, filePath} = await synthCustomVendor(logger,
          {credentials, stats, language, voice, text, filePath}));
        break;
      default:
        assert.fail(`synthAudio: unsupported speech vendor ${vendor}`);
    }
    const diff = process.hrtime(startAt);
    const time = diff[0] * 1e3 + diff[1] * 1e-6;
    rtt = time.toFixed(0);
    stats.histogram('tts.response_time', rtt, [`vendor:${vendorLabel}`]);
    debug(`tts rtt time for ${text.length} chars on ${vendorLabel}: ${rtt}`);
    logger.info(`tts rtt time for ${text.length} chars on ${vendorLabel}: ${rtt}`);

    client.setex(key, EXPIRES, audioBuffer.toString('base64'))
      .catch((err) => logger.error(err, `error calling setex on key ${key}`));
  }

  return new Promise((resolve, reject) => {
    fs.writeFile(filePath, audioBuffer, (err) => {
      if (err) return reject(err);
      resolve({filePath, servedFromCache, rtt});
    });
  });
}
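/*
  Example usage (a sketch only; values are hypothetical, and `client` and `stats`
  are assumed to be an already-connected redis client and a statsd-style client
  with increment/histogram methods):

    const {filePath, servedFromCache, rtt} = await synthAudio(client, logger, stats, {
      account_sid: 'ACxxxx',
      vendor: 'google',
      language: 'en-US',
      voice: 'en-US-Wavenet-C',
      text: 'Hello, world!',
      credentials: {...}   // vendor-specific credentials object
    });
    // play or stream filePath, then unlink it when done:
    fs.unlink(filePath, () => {});
*/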
const synthPolly = async(logger, {credentials, stats, language, voice, engine, text}) => {
  try {
    const {region, accessKeyId, secretAccessKey} = credentials;
    const polly = new PollyClient({region, credentials: {accessKeyId, secretAccessKey}});
    const opts = {
      Engine: engine,
      OutputFormat: 'mp3',
      Text: text,
      LanguageCode: language,
      TextType: text.startsWith('<speak') ? 'ssml' : 'text',
      VoiceId: voice
    };
    const command = new SynthesizeSpeechCommand(opts);
    const data = await polly.send(command);
    const chunks = [];
    return new Promise((resolve, reject) => {
      data.AudioStream
        .on('error', (err) => {
          logger.info({err}, 'synthAudio: Error synthesizing speech using aws polly');
          stats.increment('tts.count', ['vendor:aws', 'accepted:no']);
          reject(err);
        })
        .on('data', (chunk) => {
          chunks.push(chunk);
        })
        .on('end', () => resolve(Buffer.concat(chunks)));
    });
  } catch (err) {
    logger.info({err}, 'synthAudio: Error synthesizing speech using aws polly');
    stats.increment('tts.count', ['vendor:aws', 'accepted:no']);
    throw err;
  }
};
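/*
  For reference, a plain-text request above builds opts like the following
  (values hypothetical):

    {Engine: 'neural', OutputFormat: 'mp3', Text: 'Hello', LanguageCode: 'en-US',
     TextType: 'text', VoiceId: 'Joanna'}

  whereas text beginning with '<speak' is sent with TextType 'ssml'. The same
  startsWith('<speak') check is how every vendor path here detects SSML input.
*/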
const synthGoogle = async(logger, {credentials, stats, language, voice, gender, text}) => {
  const client = new ttsGoogle.TextToSpeechClient(credentials);
  const opts = {
    voice: {
      name: voice,
      languageCode: language,
      ssmlGender: gender || 'SSML_VOICE_GENDER_UNSPECIFIED'
    },
    audioConfig: {audioEncoding: 'MP3'}
  };
  Object.assign(opts, {input: text.startsWith('<speak') ? {ssml: text} : {text}});
  try {
    const responses = await client.synthesizeSpeech(opts);
    stats.increment('tts.count', ['vendor:google', 'accepted:yes']);
    client.close();
    return responses[0].audioContent;
  } catch (err) {
    logger.info({err, opts}, 'synthAudio: Error synthesizing speech using google');
    stats.increment('tts.count', ['vendor:google', 'accepted:no']);
    client && client.close();
    throw err;
  }
};

const synthIbm = async(logger, {credentials, stats, voice, text}) => {
  const {tts_api_key, tts_region} = credentials;
  const params = {
    text,
    voice,
    accept: 'audio/mp3'
  };
  try {
    const textToSpeech = new TextToSpeechV1({
      authenticator: new IamAuthenticator({apikey: tts_api_key}),
      serviceUrl: `https://api.${tts_region}.text-to-speech.watson.cloud.ibm.com`
    });
    const r = await textToSpeech.synthesize(params);
    const chunks = [];
    for await (const chunk of r.result) {
      chunks.push(chunk);
    }
    return Buffer.concat(chunks);
  } catch (err) {
    logger.info({err, params}, 'synthAudio: Error synthesizing speech using ibm');
    stats.increment('tts.count', ['vendor:ibm', 'accepted:no']);
    throw new Error(err.statusText || err.message);
  }
};

const synthMicrosoft = async(logger, {credentials, stats, language, voice, text, filePath}) => {
  try {
    const {api_key: apiKey, region, use_custom_tts, custom_tts_endpoint} = credentials;
    let content = text;
    const speechConfig = SpeechConfig.fromSubscription(apiKey, region);
    speechConfig.speechSynthesisLanguage = language;
    speechConfig.speechSynthesisVoiceName = voice;
    if (use_custom_tts && custom_tts_endpoint) {
      speechConfig.endpointId = custom_tts_endpoint;
      /**
       * Note: it seems that to use custom voice ssml is required with the voice attribute
       * Otherwise sending plain text we get "Voice does not match"
       */
      if (!content.startsWith('<speak')) {
        // eslint-disable-next-line max-len
        content = `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="${language}"><voice name="${voice}">${text}</voice></speak>`;
      }
    }
    speechConfig.speechSynthesisOutputFormat = SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3;
    const synthesizer = new SpeechSynthesizer(speechConfig);
    if (content.startsWith('<speak>')) {
      /* microsoft enforces some properties and uses a voice xml element,
         so if the user did not supply them do it for them */
      const words = content.slice(7, -8).trim().replace(/(\r\n|\n|\r)/gm, ' ');
      // eslint-disable-next-line max-len
      content = `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="${language}"><voice name="${voice}">${words}</voice></speak>`;
      logger.info({content}, 'synthMicrosoft');
    }
    return new Promise((resolve, reject) => {
      const speakAsync = content.startsWith('<speak') ?
        synthesizer.speakSsmlAsync.bind(synthesizer) :
        synthesizer.speakTextAsync.bind(synthesizer);
      speakAsync(
        content,
        (result) => {
          switch (result.reason) {
            case ResultReason.Canceled: {
              const cancellation = CancellationDetails.fromResult(result);
              logger.info({reason: cancellation.errorDetails}, 'synthAudio: (Microsoft) synthesis canceled');
              synthesizer.close();
              reject(cancellation.errorDetails);
              break;
            }
            case ResultReason.SynthesizingAudioCompleted:
              resolve(Buffer.from(result.audioData));
              synthesizer.close();
              stats.increment('tts.count', ['vendor:microsoft', 'accepted:yes']);
              break;
            default:
              logger.info({result}, 'synthAudio: (Microsoft) unexpected result');
              break;
          }
        },
        (err) => {
          logger.info({err}, 'synthAudio: (Microsoft) error synthesizing');
          stats.increment('tts.count', ['vendor:microsoft', 'accepted:no']);
          synthesizer.close();
          reject(err);
        });
    });
  } catch (err) {
    logger.info({err}, 'synthAudio: Error synthesizing speech using Microsoft');
    stats.increment('tts.count', ['vendor:microsoft', 'accepted:no']);
    throw err;
  }
};
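/*
  Illustration of the bare-<speak> normalization above (input hypothetical;
  the exact attributes depend on the language and voice passed in):

    in:  <speak>Hello there</speak>
    out: <speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis"
           xml:lang="en-US"><voice name="en-US-JennyNeural">Hello there</voice></speak>

  SSML content (anything starting with '<speak') goes through speakSsmlAsync;
  everything else through speakTextAsync.
*/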
const synthWellSaid = async(logger, {credentials, stats, language, voice, text}) => {
  const {api_key} = credentials;
  try {
    const post = bent('https://api.wellsaidlabs.com', 'POST', 'buffer', {
      'X-Api-Key': api_key,
      'Accept': 'audio/mpeg',
      'Content-Type': 'application/json'
    });
    const mp3 = await post('/v1/tts/stream', {text, speaker_id: voice});
    return mp3;
  } catch (err) {
    logger.info({err}, 'synthWellSaid returned error');
    throw err;
  }
};

const synthNuance = async(client, logger, {credentials, stats, voice, model, text}) => {
  let nuanceClient;
  const {client_id, secret, nuance_tts_uri} = credentials;

  if (nuance_tts_uri) {
    nuanceClient = await createKryptonClient(nuance_tts_uri);
  }
  else {
    /* get a nuance access token */
    const {access_token} = await getNuanceAccessToken(client, logger, client_id, secret, 'tts');
    nuanceClient = await createNuanceClient(access_token);
  }

  const v = new Voice();
  const p = new AudioParameters();
  const f = new AudioFormat();
  const pcm = new PCM();
  const params = new EventParameters();
  const request = new SynthesisRequest();
  const input = new Input();

  v.setName(voice);
  v.setModel(model);
  pcm.setSampleRateHz(8000);
  f.setPcm(pcm);
  p.setAudioFormat(f);

  if (text.startsWith('<speak')) {
    const ssml = new SSML();
    ssml.setText(text);
    input.setSsml(ssml);
  }
  else {
    const t = new Text();
    t.setText(text);
    input.setText(t);
  }

  request.setVoice(v);
  request.setAudioParams(p);
  request.setInput(input);
  request.setEventParams(params);

  return new Promise((resolve, reject) => {
    nuanceClient.unarySynthesize(request, (err, response) => {
      if (err) {
        logger.info({err}, 'synthAudio: Error synthesizing speech using nuance');
        return reject(err);
      }
      const status = response.getStatus();
      const code = status.getCode();
      if (code !== 200) {
        const message = status.getMessage();
        const details = status.getDetails();
        return reject({code, message, details});
      }
      resolve(Buffer.from(response.getAudio()));
    });
  });
};

const synthNvidia = async(client, logger, {credentials, stats, language, voice, model, text}) => {
  const {riva_server_uri} = credentials;
  let rivaClient, request;
  try {
    rivaClient = await createRivaClient(riva_server_uri);
    request = new SynthesizeSpeechRequest();
    request.setVoiceName(voice);
    request.setLanguageCode(language);
    request.setSampleRateHz(8000);
    request.setEncoding(AudioEncoding.LINEAR_PCM);
    request.setText(text);
  } catch (err) {
    logger.info({err}, 'error creating riva client');
    return Promise.reject(err);
  }
  return new Promise((resolve, reject) => {
    rivaClient.synthesize(request, (err, response) => {
      if (err) {
        logger.info({err, voice, language}, 'error synthesizing speech using Nvidia');
        return reject(err);
      }
      resolve(Buffer.from(response.getAudio()));
    });
  });
};
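/*
  The custom-vendor hook below POSTs a JSON payload to the configured
  custom_tts_url and expects raw audio back. A request looks roughly like
  this (values hypothetical):

    POST <custom_tts_url>
    Authorization: Bearer <auth_token>
    {"language": "en-US", "voice": "my-voice", "type": "text", "text": "Hello"}

  The response content-type then determines the file extension chosen by
  getFileExtFromMime (e.g. audio/l16;rate=8000 -> .r8).
*/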
const synthCustomVendor = async(logger, {credentials, stats, language, voice, text, filePath}) => {
  const {vendor, auth_token, custom_tts_url} = credentials;
  try {
    const post = bent('POST', {
      'Authorization': `Bearer ${auth_token}`,
      'Content-Type': 'application/json'
    });
    const response = await post(custom_tts_url, {
      language,
      voice,
      type: text.startsWith('<speak') ? 'ssml' : 'text',
      text
    });
    const regex = /\.[^\.]*$/g;
    const mime = response.headers['content-type'];
    const buffer = await response.arrayBuffer();
    return {
      audioBuffer: buffer,
      filePath: filePath.replace(regex, getFileExtFromMime(mime))
    };
  } catch (err) {
    logger.info({err}, `Vendor ${vendor} returned error`);
    throw err;
  }
};

const getFileExtFromMime = (mime) => {
  if (/audio\/l16.*rate=8000/.test(mime)) return '.r8';
  if (/audio\/l16.*rate=16000/.test(mime)) return '.r16';
  if (/audio\/l16.*rate=24000/.test(mime)) return '.r24';
  if (/audio\/l16.*rate=32000/.test(mime)) return '.r32';
  if (/audio\/l16.*rate=48000/.test(mime)) return '.r48';
  switch (mime) {
    case 'audio/wav':
    case 'audio/x-wav':
      return '.wav';
    case 'audio/mpeg':
    case 'audio/mp3':
      return '.mp3';
    default:
      return '.wav';
  }
};

module.exports = synthAudio;
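/*
  Quick sanity check of the mime mapping above (hypothetical REPL session):

    getFileExtFromMime('audio/mpeg')            // '.mp3'
    getFileExtFromMime('audio/l16;rate=16000')  // '.r16'
    getFileExtFromMime('application/json')      // '.wav' (fallback)
*/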