const assert = require('assert');
const fs = require('fs');
const bent = require('bent');
const ttsGoogle = require('@google-cloud/text-to-speech');
const { PollyClient, SynthesizeSpeechCommand } = require('@aws-sdk/client-polly');
const { CartesiaClient } = require('@cartesia/cartesia-js');

const sdk = require('microsoft-cognitiveservices-speech-sdk');
const TextToSpeechV1 = require('ibm-watson/text-to-speech/v1');
const { IamAuthenticator } = require('ibm-watson/auth');
const {
  ResultReason,
  SpeechConfig,
  SpeechSynthesizer,
  CancellationDetails,
  SpeechSynthesisOutputFormat
} = sdk;
const {
  makeSynthKey,
  createNuanceClient,
  createKryptonClient,
  createRivaClient,
  noopLogger,
  makeFilePath,
  makePlayhtKey
} = require('./utils');
const getNuanceAccessToken = require('./get-nuance-access-token');
const getVerbioAccessToken = require('./get-verbio-token');
const {
  SynthesisRequest,
  Voice,
  AudioFormat,
  AudioParameters,
  PCM,
  Input,
  Text,
  SSML,
  EventParameters
} = require('../stubs/nuance/synthesizer_pb');
const {SynthesizeSpeechRequest} = require('../stubs/riva/proto/riva_tts_pb');
const {AudioEncoding} = require('../stubs/riva/proto/riva_audio_pb');
const debug = require('debug')('jambonz:realtimedb-helpers');
const {
  JAMBONES_DISABLE_TTS_STREAMING,
  JAMBONES_DISABLE_AZURE_TTS_STREAMING,
  JAMBONES_HTTP_PROXY_IP,
  JAMBONES_HTTP_PROXY_PORT,
  JAMBONES_TTS_CACHE_DURATION_MINS,
  JAMBONES_TTS_TRIM_SILENCE,
  JAMBONES_AZURE_ENABLE_SSML
} = require('./config');
const EXPIRES = JAMBONES_TTS_CACHE_DURATION_MINS;
const OpenAI = require('openai');
const getAwsAuthToken = require('./get-aws-sts-token');

const trimTrailingSilence = (buffer) => {
  assert.ok(buffer instanceof Buffer, 'trimTrailingSilence - argument is not a Buffer');

  let offset = buffer.length;
  while (offset > 0) {
    // Get 16-bit value from the buffer (read in reverse)
    const value = buffer.readUInt16BE(offset - 2);
    if (value !== 0) {
      break;
    }
    offset -= 2;
  }

  // Trim the silence from the end
  return offset === buffer.length ? buffer : buffer.subarray(0, offset);
};
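
/*
 * Example (illustrative only, not executed): trimTrailingSilence strips
 * zero-valued 16-bit samples from the tail of a raw PCM buffer:
 *   trimTrailingSilence(Buffer.from([0x12, 0x34, 0x00, 0x00, 0x00, 0x00]))
 *   // => <Buffer 12 34>  (two trailing silent samples removed)
 */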

/**
 * Synthesize speech to an mp3 file, and also cache the generated speech
 * in redis (base64 format) for JAMBONES_TTS_CACHE_DURATION_MINS minutes,
 * so as to avoid paying time and again for speech synthesis of the same text.
 * It is the responsibility of the caller to unlink the mp3 file after use.
 *
 * @param {*} client - redis client
 * @param {*} logger - pino logger
 * @param {object} opts - options
 * @param {string} opts.vendor - speech vendor to use, e.g. 'google', 'aws' ('polly' is an alias
 * for 'aws'), 'microsoft', or any vendor name beginning with 'custom'
 * @param {string} opts.language - language code
 * @param {string} opts.voice - voice identifier
 * @param {string} opts.text - text or ssml to synthesize
 * @param {boolean} opts.disableTtsCache - disable TTS Cache retrieval
 * @returns object containing filepath to an mp3 file in the /tmp folder containing
 * the synthesized audio, and a variable indicating whether it was served from cache
 */
async function synthAudio(client, createHash, retrieveHash, logger, stats, { account_sid,
  vendor, language, voice, gender, text, engine, salt, model, credentials, deploymentId,
  disableTtsCache, renderForCaching = false, disableTtsStreaming, options, instructions
}) {
  let audioData;
  let servedFromCache = false;
  let rtt;
  logger = logger || noopLogger;

  assert.ok(['google', 'aws', 'polly', 'microsoft', 'wellsaid', 'nuance', 'nvidia', 'ibm', 'elevenlabs',
    'whisper', 'deepgram', 'playht', 'rimelabs', 'verbio', 'cartesia', 'inworld', 'resemble'].includes(vendor) ||
    vendor.startsWith('custom'),
  `synthAudio does not support speech vendor ${vendor}`);
  if ('google' === vendor) {
    assert.ok(language, 'synthAudio requires language when google is used');
  }
  else if (['aws', 'polly'].includes(vendor)) {
    assert.ok(voice, 'synthAudio requires voice when aws polly is used');
  }
  else if ('microsoft' === vendor) {
    assert.ok(language || deploymentId, 'synthAudio requires language when microsoft is used');
    assert.ok(voice || deploymentId, 'synthAudio requires voice when microsoft is used');
  }
  else if ('nuance' === vendor) {
    assert.ok(voice, 'synthAudio requires voice when nuance is used');
    if (!credentials.nuance_tts_uri) {
      assert.ok(credentials.client_id, 'synthAudio requires client_id in credentials when nuance is used');
      assert.ok(credentials.secret, 'synthAudio requires secret in credentials when nuance is used');
    }
  }
  else if ('nvidia' === vendor) {
    assert.ok(voice, 'synthAudio requires voice when nvidia is used');
    assert.ok(language, 'synthAudio requires language when nvidia is used');
    assert.ok(credentials.riva_server_uri, 'synthAudio requires riva_server_uri in credentials when nvidia is used');
  }
  else if ('ibm' === vendor) {
    assert.ok(voice, 'synthAudio requires voice when ibm is used');
    assert.ok(credentials.tts_region, 'synthAudio requires tts_region in credentials when ibm watson is used');
    assert.ok(credentials.tts_api_key, 'synthAudio requires tts_api_key in credentials when ibm watson is used');
  }
  else if ('wellsaid' === vendor) {
    language = 'en-US'; // WellSaid only supports English atm
    assert.ok(voice, 'synthAudio requires voice when wellsaid is used');
    assert.ok(!text.startsWith('<speak'), 'wellsaid does not support SSML tags');
  } else if ('elevenlabs' === vendor) {
    assert.ok(voice, 'synthAudio requires voice when elevenlabs is used');
    assert.ok(credentials.api_key, 'synthAudio requires api_key when elevenlabs is used');
    assert.ok(credentials.model_id, 'synthAudio requires model_id when elevenlabs is used');
  } else if ('playht' === vendor) {
    assert.ok(voice, 'synthAudio requires voice when playht is used');
    assert.ok(credentials.api_key, 'synthAudio requires api_key when playht is used');
    assert.ok(credentials.user_id, 'synthAudio requires user_id when playht is used');
    assert.ok(credentials.voice_engine, 'synthAudio requires voice_engine when playht is used');
  } else if ('inworld' === vendor) {
    assert.ok(voice, 'synthAudio requires voice when inworld is used');
    assert.ok(credentials.api_key, 'synthAudio requires api_key when inworld is used');
    assert.ok(credentials.model_id, 'synthAudio requires model_id when inworld is used');
  } else if ('rimelabs' === vendor) {
    assert.ok(voice, 'synthAudio requires voice when rimelabs is used');
    assert.ok(credentials.api_key, 'synthAudio requires api_key when rimelabs is used');
    assert.ok(credentials.model_id, 'synthAudio requires model_id when rimelabs is used');
  } else if ('whisper' === vendor) {
    assert.ok(voice, 'synthAudio requires voice when whisper is used');
    assert.ok(credentials.model_id, 'synthAudio requires model_id when whisper is used');
    assert.ok(credentials.api_key, 'synthAudio requires api_key when whisper is used');
  } else if (vendor.startsWith('custom')) {
    assert.ok(credentials.custom_tts_url, `synthAudio requires custom_tts_url in credentials when ${vendor} is used`);
  } else if ('verbio' === vendor) {
    assert.ok(voice, 'synthAudio requires voice when verbio is used');
    assert.ok(credentials.client_id, 'synthAudio requires client_id when verbio is used');
    assert.ok(credentials.client_secret, 'synthAudio requires client_secret when verbio is used');
  } else if ('deepgram' === vendor) {
    if (!credentials.deepgram_tts_uri) {
      assert.ok(credentials.api_key, 'synthAudio requires api_key when deepgram is used');
    }
  } else if ('cartesia' === vendor) {
    assert.ok(credentials.api_key, 'synthAudio requires api_key when cartesia is used');
    assert.ok(credentials.model_id, 'synthAudio requires model_id when cartesia is used');
  } else if (vendor === 'resemble') {
    assert.ok(voice, 'synthAudio requires voice when resemble is used');
    assert.ok(credentials.api_key, 'synthAudio requires api_key when resemble is used');
  }

  const key = makeSynthKey({
    account_sid,
    vendor,
    language: language || '',
    voice: voice || deploymentId,
    engine,
    // model or model_id is used to identify the tts cache.
    model: model || credentials.model_id,
    text,
    instructions
  });
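
  /*
   * Note: identical {account_sid, vendor, language, voice, engine, model, text,
   * instructions} tuples produce the same key, so repeated prompts are served
   * from redis rather than re-synthesized (the exact key format is defined by
   * makeSynthKey in ./utils).
   */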

  debug(`synth key is ${key}`);
  let cached;
  if (!disableTtsCache) {
    cached = await client.get(key);
  }
  if (cached) {
    // found in cache - extend the expiry and use it
    debug('result WAS found in cache');
    servedFromCache = true;
    stats.increment('tts.cache.requests', ['found:yes']);
    audioData = JSON.parse(cached);
    // convert base64 audio to buffer
    audioData.audioContent = Buffer.from(audioData.audioContent, 'base64');
    client.expire(key, EXPIRES).catch((err) => logger.info(err, 'Error setting expires'));
  } else {
    // not found in cache - go get it from speech vendor and add to cache
    debug('result was NOT found in cache');
    stats.increment('tts.cache.requests', ['found:no']);
    let vendorLabel = vendor;
    const startAt = process.hrtime();
    switch (vendor) {
      case 'google':
        audioData = await synthGoogle(logger, {credentials, stats, language, voice, gender, key, text});
        break;
      case 'aws':
      case 'polly':
        vendorLabel = 'aws';
        audioData = await synthPolly(createHash, retrieveHash, logger,
          {credentials, stats, language, voice, key, text, engine, renderForCaching, disableTtsStreaming,
            disableTtsCache});
        break;
      case 'azure':
      case 'microsoft':
        vendorLabel = 'microsoft';
        audioData = await synthMicrosoft(logger, {credentials, stats, language, voice, key, text, deploymentId,
          renderForCaching, disableTtsStreaming, disableTtsCache});
        break;
      case 'nuance':
        model = model || 'enhanced';
        audioData = await synthNuance(client, logger, {credentials, stats, voice, model, key, text});
        break;
      case 'nvidia':
        audioData = await synthNvidia(client, logger, {credentials, stats, language, voice, model, key, text,
          renderForCaching, disableTtsStreaming, disableTtsCache});
        break;
      case 'ibm':
        audioData = await synthIbm(logger, {credentials, stats, voice, key, text});
        break;
      case 'wellsaid':
        audioData = await synthWellSaid(logger, {credentials, stats, language, voice, key, text});
        break;
      case 'elevenlabs':
        audioData = await synthElevenlabs(logger, {
          credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming,
          disableTtsCache});
        break;
      case 'playht':
        audioData = await synthPlayHT(client, logger, {
          credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming,
          disableTtsCache});
        break;
      case 'cartesia':
        audioData = await synthCartesia(logger, {
          credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming,
          disableTtsCache});
        break;
      case 'inworld':
        audioData = await synthInworld(logger, {
          credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming,
          disableTtsCache});
        break;
      case 'rimelabs':
        audioData = await synthRimelabs(logger, {
          credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming,
          disableTtsCache});
        break;
      case 'whisper':
        audioData = await synthWhisper(logger, {
          credentials, stats, voice, key, text, instructions, renderForCaching, disableTtsStreaming,
          disableTtsCache});
        break;
      case 'verbio':
        audioData = await synthVerbio(client, logger, {
          credentials, stats, voice, key, text, renderForCaching, disableTtsStreaming, disableTtsCache});
        if (audioData?.filePath) return audioData;
        break;
      case 'deepgram':
        audioData = await synthDeepgram(logger, {credentials, stats, model, key, text,
          renderForCaching, disableTtsStreaming, disableTtsCache});
        break;
      case 'resemble':
        audioData = await synthResemble(logger, {
          credentials, stats, voice, key, text, options, renderForCaching, disableTtsStreaming, disableTtsCache});
        break;
      // matches any vendor name beginning with 'custom'; for all other vendors
      // the ternary evaluates to a sentinel that can never match
      case vendor.startsWith('custom') ? vendor : 'cant_match_value':
        audioData = await synthCustomVendor(logger,
          {credentials, stats, language, voice, key, text, renderForCaching, disableTtsStreaming, disableTtsCache});
        break;
      default:
        assert.fail(`synthAudio: unsupported speech vendor ${vendor}`);
    }
    if ('filePath' in audioData) return audioData;
    const diff = process.hrtime(startAt);
    const time = diff[0] * 1e3 + diff[1] * 1e-6;
    rtt = time.toFixed(0);
    stats.histogram('tts.response_time', rtt, [`vendor:${vendorLabel}`]);
    debug(`tts rtt time for ${text.length} chars on ${vendorLabel}: ${rtt}`);
    logger.info(`tts rtt time for ${text.length} chars on ${vendorLabel}: ${rtt}`);
    // Save audio json to cache
    client.setex(key, EXPIRES, JSON.stringify({
      ...audioData,
      audioContent: audioData.audioContent?.toString('base64')
    }))
      .catch((err) => logger.error(err, `error calling setex on key ${key}`));
  }
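
  /*
   * The cached value is JSON of the form (illustrative):
   *   {"audioContent":"<base64>","extension":"mp3","sampleRate":8000}
   * with audioContent converted back to a Buffer when read from cache above.
   */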

  return new Promise((resolve, reject) => {
    const { audioContent, extension } = audioData;
    const filePath = makeFilePath({
      key,
      salt,
      extension
    });
    fs.writeFile(filePath, audioContent, (err) => {
      if (err) return reject(err);
      resolve({filePath, servedFromCache, rtt});
    });
  });
}
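
/*
 * Example (illustrative sketch, not part of the module): a typical call, as
 * jambonz makes at runtime; client, createHash, retrieveHash, logger and stats
 * are assumed to be supplied by the caller.
 *
 *   const {filePath, servedFromCache, rtt} = await synthAudio(
 *     client, createHash, retrieveHash, logger, stats, {
 *       account_sid: 'ACxxxx',
 *       vendor: 'google',
 *       language: 'en-US',
 *       voice: 'en-US-Standard-C',
 *       text: 'Hello, world',
 *       renderForCaching: true
 *     });
 *   // ...play filePath, then unlink it when done
 */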

const synthPolly = async(createHash, retrieveHash, logger,
  {credentials, stats, language, voice, engine, key, text, renderForCaching, disableTtsStreaming, disableTtsCache}) => {
  const {region, accessKeyId, secretAccessKey, roleArn} = credentials;
  if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {

    let params = '{';
    params += `language=${language}`;
    params += `,playback_id=${key}`;
    params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
    params += ',vendor=aws';
    if (accessKeyId && secretAccessKey) {
      params += `,accessKeyId=${accessKeyId}`;
      params += `,secretAccessKey=${secretAccessKey}`;
    } else if (roleArn) {
      const cred = await getAwsAuthToken(
        logger, createHash, retrieveHash,
        {
          region,
          roleArn
        });

      if (cred) {
        params += `,accessKeyId=${cred.accessKeyId}`;
        params += `,secretAccessKey=${cred.secretAccessKey}`;
        params += `,sessionToken=${cred.sessionToken}`;
      }
    }
    if (region) params += `,region=${region}`;
    if (engine) params += `,engine=${engine}`;
    params += '}';

    return {
      filePath: `say:${params}${text.replace(/\n/g, ' ').replace(/\r/g, ' ')}`,
      servedFromCache: false,
      rtt: 0
    };
  }
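
  /*
   * Note: the say:{...} value returned above is not a real file path; it is a
   * streaming descriptor consumed by the jambonz media server, e.g.
   * (illustrative values):
   *   say:{language=en-US,playback_id=abc123,write_cache_file=1,vendor=aws,region=us-east-1}Hello world
   */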
  try {
    let polly;
    if (accessKeyId && secretAccessKey) {
      polly = new PollyClient({
        region,
        credentials: {
          accessKeyId,
          secretAccessKey
        }
      });
    } else if (roleArn) {
      polly = new PollyClient({
        region,
        credentials: await getAwsAuthToken(
          logger, createHash, retrieveHash,
          {
            region,
            roleArn
          }),
      });
    } else {
      // AWS RoleArn assigned to Instance profile
      polly = new PollyClient({region});
    }
    const opts = {
      Engine: engine,
      OutputFormat: 'mp3',
      Text: text,
      LanguageCode: language,
      TextType: text.startsWith('<speak>') ? 'ssml' : 'text',
      VoiceId: voice
    };
    const command = new SynthesizeSpeechCommand(opts);
    const data = await polly.send(command);
    const chunks = [];
    return new Promise((resolve, reject) => {
      data.AudioStream
        .on('error', (err) => {
          logger.info({err}, 'synthAudio: Error synthesizing speech using aws polly');
          stats.increment('tts.count', ['vendor:aws', 'accepted:no']);
          reject(err);
        })
        .on('data', (chunk) => {
          chunks.push(chunk);
        })
        .on('end', () => resolve(
          {
            audioContent: Buffer.concat(chunks),
            extension: 'mp3',
            sampleRate: 8000
          }
        ));
    });
  } catch (err) {
    logger.info({err}, 'synthAudio: Error synthesizing speech using aws polly');
    stats.increment('tts.count', ['vendor:aws', 'accepted:no']);
    throw err;
  }
};
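
/*
 * Note: when not streaming, synthPolly resolves with
 *   {audioContent: <Buffer>, extension: 'mp3', sampleRate: 8000}
 * which synthAudio writes to /tmp and caches in redis.
 */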

const synthGoogle = async(logger, {credentials, stats, language, voice, gender, text}) => {
  const client = new ttsGoogle.TextToSpeechClient(credentials);
  // If google custom voice cloning is used.
  // As of 31 Oct 2024 the google node sdk does not yet support voice cloning,
  // so call the REST endpoint directly.
  if (typeof voice === 'object' && voice.voice_cloning_key) {
    try {
      const accessToken = await client.auth.getAccessToken();
      const projectId = await client.getProjectId();

      const post = bent('https://texttospeech.googleapis.com', 'POST', 'json', {
        'Authorization': `Bearer ${accessToken}`,
        'x-goog-user-project': projectId,
        'Content-Type': 'application/json; charset=utf-8'
      });

      const payload = {
        input: {
          text
        },
        voice: {
          language_code: language,
          voice_clone: {
            voice_cloning_key: voice.voice_cloning_key
          }
        },
        audioConfig: {
          // Voice cloning is still in v1beta1 and supports LINEAR16 wav at 24,000 Hz
          audioEncoding: 'LINEAR16',
          sample_rate_hertz: 24000
        }
      };

      const wav = await post('/v1beta1/text:synthesize', payload);
      return {
        audioContent: Buffer.from(wav.audioContent, 'base64'),
        extension: 'wav',
        sampleRate: 24000
      };
    } catch (err) {
      logger.info({err: await err.text()}, 'synthGoogle returned error');
      throw err;
    }
  }

  const opts = {
    voice: {
      ...(typeof voice === 'string' && {name: voice}),
      ...(typeof voice === 'object' && {customVoice: voice}),
      languageCode: language,
      ssmlGender: gender || 'SSML_VOICE_GENDER_UNSPECIFIED'
    },
    audioConfig: {audioEncoding: 'MP3'}
  };
  Object.assign(opts, {input: text.startsWith('<speak>') ? {ssml: text} : {text}});
  try {
    const responses = await client.synthesizeSpeech(opts);
    stats.increment('tts.count', ['vendor:google', 'accepted:yes']);
    client.close();
    return {
      audioContent: responses[0].audioContent,
      extension: 'mp3',
      sampleRate: 8000
    };
  } catch (err) {
    logger.info({err, opts}, 'synthAudio: Error synthesizing speech using google');
    stats.increment('tts.count', ['vendor:google', 'accepted:no']);
    client && client.close();
    throw err;
  }
};
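
/*
 * Example (illustrative): a string voice selects a named voice,
 *   synthGoogle(logger, {credentials, stats, language: 'en-US',
 *     voice: 'en-US-Standard-C', text: 'hi'});
 * while an object voice is passed through as customVoice, or, when it carries
 * a voice_cloning_key, routed to the v1beta1 cloning endpoint above.
 */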

const synthIbm = async(logger, {credentials, stats, voice, text}) => {
  const {tts_api_key, tts_region} = credentials;
  const params = {
    text,
    voice,
    accept: 'audio/mp3'
  };

  try {
    const textToSpeech = new TextToSpeechV1({
      authenticator: new IamAuthenticator({
        apikey: tts_api_key,
      }),
      serviceUrl: `https://api.${tts_region}.text-to-speech.watson.cloud.ibm.com`
    });

    const r = await textToSpeech.synthesize(params);
    const chunks = [];
    for await (const chunk of r.result) {
      chunks.push(chunk);
    }
    return {
      audioContent: Buffer.concat(chunks),
      extension: 'mp3',
      sampleRate: 8000
    };
  } catch (err) {
    logger.info({err, params}, 'synthAudio: Error synthesizing speech using ibm');
    stats.increment('tts.count', ['vendor:ibm', 'accepted:no']);
    throw new Error(err.statusText || err.message);
  }
};

async function _synthOnPremMicrosoft(logger, {
  credentials,
  language,
  voice,
  text
}) {
  const {use_custom_tts, custom_tts_endpoint_url, api_key} = credentials;
  let content = text;
  if (use_custom_tts && !content.startsWith('<speak')) {
    /**
     * Note: custom voices seem to require ssml with the voice attribute;
     * otherwise, sending plain text yields "Voice does not match"
     */
    content = `<speak>${text}</speak>`;
  }

  if (content.startsWith('<speak>')) {
    /* microsoft enforces some properties and uses voice xml element so if the user did not supply do it for them */
    const words = content.slice(7, -8).trim().replace(/(\r\n|\n|\r)/gm, ' ');
    // eslint-disable-next-line max-len
    content = `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${language}"><voice name="${voice}">${words}</voice></speak>`;
    logger.info({content}, 'synthMicrosoft');
  }
  else if (JAMBONES_AZURE_ENABLE_SSML && !content.startsWith('<speak')) {
    // eslint-disable-next-line max-len
    content = `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${language}"><voice name="${voice}"><lang xml:lang="${language}">${text}</lang></voice></speak>`;
  }

  try {
    const trimSilence = JAMBONES_TTS_TRIM_SILENCE;
    const post = bent('POST', 'buffer', {
      'X-Microsoft-OutputFormat': trimSilence ? 'raw-8khz-16bit-mono-pcm' : 'audio-16khz-32kbitrate-mono-mp3',
      'Content-Type': 'application/ssml+xml',
      'User-Agent': 'Jambonz',
      ...(api_key && {'Ocp-Apim-Subscription-Key': api_key})
    });
    const audioContent = await post(custom_tts_endpoint_url, content);
    return {
      audioContent,
      extension: trimSilence ? 'r8' : 'mp3',
      sampleRate: 8000
    };
  } catch (err) {
    logger.info({err}, '_synthOnPremMicrosoft returned error');
    throw err;
  }
}
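
/*
 * Example (illustrative): with language 'en-US', voice 'en-US-JennyNeural' and
 * text '<speak>hello</speak>', the wrapping above yields:
 *   <speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="en-US"><voice name="en-US-JennyNeural">hello</voice></speak>
 */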

const synthMicrosoft = async(logger, {
  credentials,
  stats,
  language,
  voice,
  key,
  text,
  renderForCaching,
  disableTtsStreaming,
  disableTtsCache
}) => {
  try {
    const {api_key: apiKey, region, use_custom_tts, custom_tts_endpoint, custom_tts_endpoint_url} = credentials;
    // clean up the text
    let content = text;
    if (use_custom_tts && !content.startsWith('<speak')) {
      /**
       * Note: custom voices seem to require ssml with the voice attribute;
       * otherwise, sending plain text yields "Voice does not match"
       */
      content = `<speak>${text}</speak>`;
    }

    if (content.startsWith('<speak>')) {
      /* microsoft enforces some properties and uses voice xml element so if the user did not supply do it for them */
      const words = content.slice(7, -8).trim().replace(/(\r\n|\n|\r)/gm, ' ');
      // eslint-disable-next-line max-len
      content = `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${language}"><voice name="${voice}">${words}</voice></speak>`;
      logger.info({content}, 'synthMicrosoft');
    }
    else if (JAMBONES_AZURE_ENABLE_SSML && !content.startsWith('<speak')) {
      // eslint-disable-next-line max-len
      content = `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${language}"><voice name="${voice}"><lang xml:lang="${language}">${text}</lang></voice></speak>`;
    }
    if (!JAMBONES_DISABLE_TTS_STREAMING && !JAMBONES_DISABLE_AZURE_TTS_STREAMING &&
      !renderForCaching && !disableTtsStreaming) {
      let params = '{';
      params += `api_key=${apiKey}`;
      params += `,playback_id=${key}`;
      params += `,language=${language}`;
      params += ',vendor=microsoft';
      params += `,voice=${voice}`;
      params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
      if (region) params += `,region=${region}`;
      if (custom_tts_endpoint) params += `,endpointId=${custom_tts_endpoint}`;
      if (custom_tts_endpoint_url) params += `,endpoint=${custom_tts_endpoint_url}`;
      if (JAMBONES_HTTP_PROXY_IP) params += `,http_proxy_ip=${JAMBONES_HTTP_PROXY_IP}`;
      if (JAMBONES_HTTP_PROXY_PORT) params += `,http_proxy_port=${JAMBONES_HTTP_PROXY_PORT}`;
      params += '}';
      return {
        filePath: `say:${params}${content.replace(/\n/g, ' ')}`,
        servedFromCache: false,
        rtt: 0
      };
    }
    // Azure Onprem
    if (use_custom_tts && custom_tts_endpoint_url) {
      return await _synthOnPremMicrosoft(logger, {
        credentials,
        stats,
        language,
        voice,
        text
      });
    }
    // Azure hosted service
    const trimSilence = JAMBONES_TTS_TRIM_SILENCE;
    const speechConfig = SpeechConfig.fromSubscription(apiKey, region);
    speechConfig.speechSynthesisLanguage = language;
    speechConfig.speechSynthesisVoiceName = voice;
    if (use_custom_tts && custom_tts_endpoint) {
      speechConfig.endpointId = custom_tts_endpoint;
    }
    speechConfig.speechSynthesisOutputFormat = trimSilence ?
      SpeechSynthesisOutputFormat.Raw8Khz16BitMonoPcm :
      SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3;

    if (JAMBONES_HTTP_PROXY_IP && JAMBONES_HTTP_PROXY_PORT) {
      logger.debug(
        `synthMicrosoft: using proxy ${JAMBONES_HTTP_PROXY_IP}:${JAMBONES_HTTP_PROXY_PORT}`);
      speechConfig.setProxy(JAMBONES_HTTP_PROXY_IP, JAMBONES_HTTP_PROXY_PORT);
    }
    const synthesizer = new SpeechSynthesizer(speechConfig);

    return new Promise((resolve, reject) => {
      const speakAsync = content.startsWith('<speak') ?
        synthesizer.speakSsmlAsync.bind(synthesizer) :
        synthesizer.speakTextAsync.bind(synthesizer);
      speakAsync(
        content,
        async(result) => {
          switch (result.reason) {
            case ResultReason.Canceled:
              const cancellation = CancellationDetails.fromResult(result);
              logger.info({reason: cancellation.errorDetails}, 'synthAudio: (Microsoft) synthesis canceled');
              synthesizer.close();
              reject(cancellation.errorDetails);
              break;
            case ResultReason.SynthesizingAudioCompleted:
              let buffer = Buffer.from(result.audioData);
              if (trimSilence) buffer = trimTrailingSilence(buffer);
              resolve({
                audioContent: buffer,
                extension: trimSilence ? 'r8' : 'mp3',
                sampleRate: 8000
              });
              synthesizer.close();
              stats.increment('tts.count', ['vendor:microsoft', 'accepted:yes']);
              break;
            default:
              logger.info({result}, 'synthAudio: (Microsoft) unexpected result');
              break;
          }
        },
        (err) => {
          logger.info({err}, 'synthAudio: (Microsoft) error synthesizing');
          stats.increment('tts.count', ['vendor:microsoft', 'accepted:no']);
          synthesizer.close();
          reject(err);
        });
    });
  } catch (err) {
    logger.info({err}, 'synthAudio: Error synthesizing speech using Microsoft');
    stats.increment('tts.count', ['vendor:microsoft', 'accepted:no']);
    throw err;
  }
};

const synthWellSaid = async(logger, {credentials, stats, language, voice, gender, text}) => {
  const {api_key} = credentials;
  try {
    const post = bent('https://api.wellsaidlabs.com', 'POST', 'buffer', {
      'X-Api-Key': api_key,
      'Accept': 'audio/mpeg',
      'Content-Type': 'application/json'
    });
    const audioContent = await post('/v1/tts/stream', {
      text,
      speaker_id: voice
    });
    return {
      audioContent,
      extension: 'mp3',
      sampleRate: 8000
    };
  } catch (err) {
    logger.info({err}, 'synthWellSaid returned error');
    throw err;
  }
};

const synthNuance = async(client, logger, {credentials, stats, voice, model, text}) => {
  let nuanceClient;
  const {client_id, secret, nuance_tts_uri} = credentials;
  if (nuance_tts_uri) {
    nuanceClient = await createKryptonClient(nuance_tts_uri);
  }
  else {
    /* get a nuance access token */
    const {access_token} = await getNuanceAccessToken(client, logger, client_id, secret, 'tts');
    nuanceClient = await createNuanceClient(access_token);
  }

  const v = new Voice();
  const p = new AudioParameters();
  const f = new AudioFormat();
  const pcm = new PCM();
  const params = new EventParameters();
  const request = new SynthesisRequest();
  const input = new Input();

  if (text.startsWith('<speak')) {
    const ssml = new SSML();
    ssml.setText(text);
    input.setSsml(ssml);
  }
  else {
    const t = new Text();
    t.setText(text);
    input.setText(t);
  }
  const sampleRate = 8000;
  pcm.setSampleRateHz(sampleRate);
  f.setPcm(pcm);
  p.setAudioFormat(f);
  v.setName(voice);
  v.setModel(model);
  request.setVoice(v);
  request.setAudioParams(p);
  request.setInput(input);
  request.setEventParams(params);
  request.setUserId('jambonz');

  return new Promise((resolve, reject) => {
    nuanceClient.unarySynthesize(request, (err, response) => {
      if (err) {
        logger.info({err}, 'synthNuance returned error');
        return reject(err);
      }
      const status = response.getStatus();
      const code = status.getCode();
      if (code !== 200) {
        const message = status.getMessage();
        const details = status.getDetails();
        return reject({code, message, details});
      }
      resolve({
        audioContent: Buffer.from(response.getAudio()),
        extension: 'r8',
        sampleRate
      });
    });
  });
};
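
/*
 * Note: nuance (and nvidia below) return raw 16-bit PCM at 8000 Hz, hence the
 * 'r8' extension, while most other vendors return mp3.
 */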

const synthNvidia = async(client, logger, {
  credentials, stats, language, voice, model, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
}) => {
  const {riva_server_uri} = credentials;
  if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
    let params = '';
    params += `{riva_server_uri=${riva_server_uri}`;
    params += `,playback_id=${key}`;
    params += `,voice=${voice}`;
    params += `,language=${language}`;
    params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
    params += '}';

    return {
      filePath: `say:${params}${text.replace(/\n/g, ' ')}`,
      servedFromCache: false,
      rtt: 0
    };
  }
  let rivaClient, request;
  const sampleRate = 8000;
  try {
    rivaClient = await createRivaClient(riva_server_uri);
    request = new SynthesizeSpeechRequest();
    request.setVoiceName(voice);
    request.setLanguageCode(language);
    request.setSampleRateHz(sampleRate);
    request.setEncoding(AudioEncoding.LINEAR_PCM);
    request.setText(text);
  } catch (err) {
    logger.info({err}, 'error creating riva client');
    return Promise.reject(err);
  }

  return new Promise((resolve, reject) => {
    rivaClient.synthesize(request, (err, response) => {
      if (err) {
        logger.info({err, voice, language}, 'error synthesizing speech using Nvidia');
        return reject(err);
      }
      resolve({
        audioContent: Buffer.from(response.getAudio()),
        extension: 'r8',
        sampleRate
      });
    });
  });
};


const synthCustomVendor = async(logger, {credentials, stats, language, voice,
  text, filePath, renderForCaching, disableTtsStreaming, key, disableTtsCache}) => {
  const {vendor, auth_token, custom_tts_url} = credentials;

  if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
    let params = '{';
    params += `auth_token=${auth_token}`;
    params += `,playback_id=${key}`;
    params += `,custom_tts_url=${custom_tts_url}`;
    params += ',vendor=custom';
    params += `,voice=${voice}`;
    params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
    params += '}';

    return {
      filePath: `say:${params}${text.replace(/\n/g, ' ').replace(/\r/g, ' ')}`,
      servedFromCache: false,
      rtt: 0
    };
  }

  try {
    const post = bent('POST', {
      'Authorization': `Bearer ${auth_token}`,
      'Content-Type': 'application/json'
    });

    const response = await post(custom_tts_url, {
      language,
      voice,
      type: text.startsWith('<speak>') ? 'ssml' : 'text',
      text
    });

    const mime = response.headers['content-type'];
    const buffer = await response.arrayBuffer();
    const [extension, sampleRate] = getFileExtFromMime(mime);
    return {
      audioContent: buffer,
      extension,
      sampleRate
    };
  } catch (err) {
    logger.info({err}, `Vendor ${vendor} returned error`);
    throw err;
  }
};
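
/*
 * Example (illustrative): a custom vendor endpoint receives a POST like
 *   POST <custom_tts_url>
 *   Authorization: Bearer <auth_token>
 *   {"language":"en-US","voice":"my-voice","type":"text","text":"Hello"}
 * and should reply with audio whose content-type getFileExtFromMime (below)
 * can map to a file extension and sample rate.
 */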

const synthElevenlabs = async(logger, {
  credentials, options, stats, voice, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
}) => {
  const {api_key, model_id, api_uri, options: credOpts} = credentials;
  const opts = !!options && Object.keys(options).length !== 0 ? options : JSON.parse(credOpts || '{}');

  /* default to using the streaming interface, unless disabled by env var OR we want just a cache file */
  if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
    let params = '{';
    params += `api_key=${api_key}`;
    params += `,playback_id=${key}`;
    params += ',vendor=elevenlabs';
    params += `,voice=${voice}`;
    params += `,model_id=${model_id}`;
    params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
    if (api_uri) params += `,api_uri=${api_uri}`;
    if (opts.optimize_streaming_latency !== null && opts.optimize_streaming_latency !== undefined) {
      params += `,optimize_streaming_latency=${opts.optimize_streaming_latency}`;
    }
    if (opts.voice_settings?.similarity_boost) params += `,similarity_boost=${opts.voice_settings.similarity_boost}`;
    if (opts.voice_settings?.stability) params += `,stability=${opts.voice_settings.stability}`;
    if (opts.voice_settings?.style) params += `,style=${opts.voice_settings.style}`;
    if (opts.voice_settings?.speed !== null && opts.voice_settings?.speed !== undefined)
      params += `,speed=${opts.voice_settings.speed}`;
    if (opts.voice_settings?.use_speaker_boost === false) params += ',use_speaker_boost=false';
    if (opts.previous_text) params += `,previous_text=${opts.previous_text}`;
    if (opts.next_text) params += `,next_text=${opts.next_text}`;
    if (opts.pronunciation_dictionary_locators && Array.isArray(opts.pronunciation_dictionary_locators))
      params += `,pronunciation_dictionary_locators=${JSON.stringify(opts.pronunciation_dictionary_locators)}`;
    params += '}';

    return {
      filePath: `say:${params}${text.replace(/\n/g, ' ').replace(/\r/g, ' ')}`,
      servedFromCache: false,
      rtt: 0
    };
  }

  const optimize_streaming_latency = opts.optimize_streaming_latency ?
    `?optimize_streaming_latency=${opts.optimize_streaming_latency}` : '';
  try {
    const post = bent(`https://${api_uri}`, 'POST', 'buffer', {
      'xi-api-key': api_key,
      'Accept': 'audio/mpeg',
      'Content-Type': 'application/json'
    });
    const audioContent = await post(`/v1/text-to-speech/${voice}${optimize_streaming_latency}`, {
      text,
      model_id,
      voice_settings: {
        stability: 0.5,
        similarity_boost: 0.5
      },
      ...opts
    });
    return {
      audioContent,
      extension: 'mp3',
      sampleRate: 8000
    };
  } catch (err) {
    logger.info({err}, 'synthElevenlabs returned error');
    stats.increment('tts.count', ['vendor:elevenlabs', 'accepted:no']);
    throw err;
  }
};
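
/*
 * Note: per-request options take precedence over the credential-level options
 * JSON; e.g. passing {voice_settings: {stability: 0.8}} (illustrative)
 * replaces the default voice_settings sent above.
 */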

const synthPlayHT = async(client, logger, {
  credentials, options, stats, voice, language, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
}) => {
  const {api_key, user_id, voice_engine, playht_tts_uri, options: credOpts} = credentials;
  const opts = !!options && Object.keys(options).length !== 0 ? options : JSON.parse(credOpts || '{}');

  let synthesizeUrl = playht_tts_uri ? `${playht_tts_uri}/api/v2/tts/stream` : 'https://api.play.ht/api/v2/tts/stream';
  // If the voice engine is Play3.0, the synthesize URL is obtained from the authentication endpoint
  if (voice_engine === 'Play3.0') {
    try {
      const post = bent('https://api.play.ht', 'POST', 'json', 201, {
        'AUTHORIZATION': api_key,
        'X-USER-ID': user_id,
        'Accept': 'application/json'
      });
      const key = makePlayhtKey(api_key);
      const url = await client.get(key);
      if (!url) {
        const {inference_address, expires_at_ms} = await post('/api/v3/auth');
        synthesizeUrl = inference_address;
        const expiry = Math.floor((expires_at_ms - Date.now()) / 1000 - 30);
        await client.set(key, inference_address, 'EX', expiry);
      } else {
        // Use cached URL
        synthesizeUrl = url;
      }
    } catch (err) {
      logger.info({err}, 'synthPlayHT returned error for authentication version 3.0');
      stats.increment('tts.count', ['vendor:playht', 'accepted:no']);
      throw err;
    }
  }

  /* default to using the streaming interface, unless disabled by env var OR we want just a cache file */
  if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
    let params = '{';
    params += `api_key=${api_key}`;
    params += `,playback_id=${key}`;
    params += `,user_id=${user_id}`;
    params += ',vendor=playht';
    params += `,voice=${voice}`;
    params += `,voice_engine=${voice_engine}`;
    params += `,synthesize_url=${synthesizeUrl}`;
    params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
    params += `,language=${language}`;
    if (opts.quality) params += `,quality=${opts.quality}`;
    if (opts.speed) params += `,speed=${opts.speed}`;
    if (opts.seed) params += `,seed=${opts.seed}`;
    if (opts.temperature) params += `,temperature=${opts.temperature}`;
    if (opts.emotion) params += `,emotion=${opts.emotion}`;
    if (opts.voice_guidance) params += `,voice_guidance=${opts.voice_guidance}`;
    if (opts.style_guidance) params += `,style_guidance=${opts.style_guidance}`;
    if (opts.text_guidance) params += `,text_guidance=${opts.text_guidance}`;
    if (opts.top_p) params += `,top_p=${opts.top_p}`;
    if (opts.repetition_penalty) params += `,repetition_penalty=${opts.repetition_penalty}`;
    params += '}';

    return {
      filePath: `say:${params}${text.replace(/\n/g, ' ').replace(/\r/g, ' ')}`,
      servedFromCache: false,
      rtt: 0
    };
  }

  try {
    const post = bent('POST', 'buffer', {
      ...(voice_engine !== 'Play3.0' && {
        'AUTHORIZATION': api_key,
        'X-USER-ID': user_id,
      }),
      'Accept': 'audio/mpeg',
      'Content-Type': 'application/json'
    });

    const audioContent = await post(synthesizeUrl, {
      text,
      ...(voice_engine === 'Play3.0' && { language }),
      voice,
      voice_engine,
      output_format: 'mp3',
      sample_rate: 8000,
      ...opts
    });
    return {
      audioContent,
      extension: 'mp3',
      sampleRate: 8000
    };
  } catch (err) {
    logger.info({err}, 'synthPlayHT returned error');
    stats.increment('tts.count', ['vendor:playht', 'accepted:no']);
    throw err;
  }
};

const synthInworld = async(logger, {
  credentials, options, stats, voice, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
}) => {
  const {api_key, model_id, options: credOpts} = credentials;
  const opts = !!options && Object.keys(options).length !== 0 ? options : JSON.parse(credOpts || '{}');

  /* default to using the streaming interface, unless disabled by env var OR we want just a cache file */
  if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
    let params = '{';
    params += `api_key=${api_key}`;
    params += `,playback_id=${key}`;
    params += `,model_id=${model_id}`;
    params += ',vendor=inworld';
    params += `,voice=${voice}`;
    params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
    if (opts.temperature) params += `,temperature=${opts.temperature}`;
    if (opts.audioConfig?.pitch) params += `,pitch=${opts.audioConfig.pitch}`;
    if (opts.audioConfig?.speakingRate) params += `,speakingRate=${opts.audioConfig.speakingRate}`;
    params += '}';

    return {
      filePath: `say:${params}${text.replace(/\n/g, ' ').replace(/\r/g, ' ')}`,
      servedFromCache: false,
      rtt: 0
    };
  }

  try {
    const url = 'https://api.inworld.ai/tts/v1/voice';
    const sampleRate = 8000;
    const fetchOptions = {
      method: 'POST',
      headers: {
        'Authorization': `Basic ${api_key}`,
        'Content-Type': 'application/json'
      },
      body: JSON.stringify({
        text,
        voiceId: voice,
        modelId: model_id,
        audioConfig: {
          ...(opts.audioConfig || {}),
          audioEncoding: 'MP3',
        }
      })
    };

    const response = await fetch(url, fetchOptions);

    if (!response.ok) {
      throw new Error(await response.text());
    }
    const json = await response.json();
    return {
      audioContent: Buffer.from(json.audioContent, 'base64'),
      extension: 'mp3',
      sampleRate
    };
  } catch (err) {
    logger.info({err}, 'synthInworld returned error');
    stats.increment('tts.count', ['vendor:inworld', 'accepted:no']);
    throw err;
  }
};

const synthRimelabs = async(logger, {
  credentials, options, stats, language, voice, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
}) => {
  const {api_key, model_id, options: credOpts} = credentials;
  const opts = !!options && Object.keys(options).length !== 0 ? options : JSON.parse(credOpts || '{}');

  /* default to using the streaming interface, unless disabled by env var OR we want just a cache file */
  if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
    let params = '{';
    params += `api_key=${api_key}`;
    params += `,playback_id=${key}`;
    params += `,model_id=${model_id}`;
    params += ',vendor=rimelabs';
    params += `,language=${language}`;
    params += `,voice=${voice}`;
    params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
    if (opts.speedAlpha) params += `,speed_alpha=${opts.speedAlpha}`;
    if (opts.reduceLatency) params += `,reduce_latency=${opts.reduceLatency}`;
    // Arcana model parameters
    if (opts.temperature) params += `,temperature=${opts.temperature}`;
    if (opts.repetition_penalty) params += `,repetition_penalty=${opts.repetition_penalty}`;
    if (opts.top_p) params += `,top_p=${opts.top_p}`;
    if (opts.max_tokens) params += `,max_tokens=${opts.max_tokens}`;
    params += '}';

    return {
      filePath: `say:${params}${text.replace(/\n/g, ' ').replace(/\r/g, ' ')}`,
      servedFromCache: false,
      rtt: 0
    };
  }

  try {
    const post = bent('https://users.rime.ai', 'POST', 'buffer', {
      'Authorization': `Bearer ${api_key}`,
      'Accept': 'audio/mp3',
      'Content-Type': 'application/json'
    });
    const sampleRate = 8000;
    const audioContent = await post('/v1/rime-tts', {
      speaker: voice,
      text,
      modelId: model_id,
      samplingRate: sampleRate,
      lang: language,
      ...opts
    });
    return {
      audioContent,
      extension: 'mp3',
      sampleRate
    };
  } catch (err) {
    logger.info({err}, 'synthRimelabs returned error');
    stats.increment('tts.count', ['vendor:rimelabs', 'accepted:no']);
    throw err;
  }
};
const synthVerbio = async(client, logger, {
  credentials, stats, voice, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
}) => {
  //https://doc.speechcenter.verbio.com/#tag/Text-To-Speech-REST-API
  if (text.length > 2000) {
    throw new Error('Verbio cannot synthesize text longer than 2000 characters');
  }
  const token = await getVerbioAccessToken(client, logger, credentials);
  if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
    let params = '{';
    params += `access_token=${token.access_token}`;
    params += `,playback_id=${key}`;
    params += ',vendor=verbio';
    params += `,voice=${voice}`;
    params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
    params += '}';

    return {
      filePath: `say:${params}${text.replace(/\n/g, ' ')}`,
      servedFromCache: false,
      rtt: 0
    };
  }

  try {
    const post = bent('https://us.rest.speechcenter.verbio.com', 'POST', 'buffer', {
      'Authorization': `Bearer ${token.access_token}`,
      'User-Agent': 'jambonz',
      'Content-Type': 'application/json'
    });
    const audioContent = await post('/api/v1/synthesize', {
      voice_id: voice,
      output_sample_rate: '8k',
      output_encoding: 'pcm16',
      text
    });
    return {
      audioContent,
      extension: 'r8',
      sampleRate: 8000
    };
  } catch (err) {
    logger.info({err}, 'synthVerbio returned error');
    stats.increment('tts.count', ['vendor:verbio', 'accepted:no']);
    throw err;
  }
};

const synthWhisper = async(logger, {credentials, stats, voice, key, text, instructions,
  renderForCaching, disableTtsStreaming, disableTtsCache}) => {
  const {api_key, model_id, baseURL, timeout, speed} = credentials;
  /* if the env is set to stream then bail out, unless we are specifically rendering to generate a cache file */
  if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
    let params = '{';
    params += `api_key=${api_key}`;
    params += `,playback_id=${key}`;
    params += `,model_id=${model_id}`;
    params += ',vendor=whisper';
    params += `,voice=${voice}`;
    params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
    if (speed) params += `,speed=${speed}`;
    // commas separate parameters in the freeswitch tts module, so replace them
    if (instructions) params += `,instructions=${instructions.replace(/\n/g, ' ').replace(/,/g, ';')}`;
    params += '}';

    return {
      filePath: `say:${params}${text.replace(/\n/g, ' ')}`,
      servedFromCache: false,
      rtt: 0
    };
  }
  try {
    const openai = new OpenAI.OpenAI({
      apiKey: api_key,
      timeout: timeout || 5000,
      ...(baseURL && {baseURL})
    });

    const mp3 = await openai.audio.speech.create({
      model: model_id,
      voice,
      input: text,
      ...(instructions && {instructions}),
      response_format: 'mp3'
    });
    return {
      audioContent: Buffer.from(await mp3.arrayBuffer()),
      extension: 'mp3',
      sampleRate: 8000
    };
  } catch (err) {
    logger.info({err}, 'synthWhisper returned error');
    stats.increment('tts.count', ['vendor:openai', 'accepted:no']);
    throw err;
  }
};

const synthDeepgram = async(logger, {credentials, stats, model, key, text, renderForCaching,
  disableTtsStreaming, disableTtsCache}) => {
  const {api_key, deepgram_tts_uri} = credentials;
  if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
    let params = '{';
    params += `api_key=${api_key}`;
    params += `,playback_id=${key}`;
    params += ',vendor=deepgram';
    params += `,voice=${model}`;
    params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
    if (deepgram_tts_uri) params += `,endpoint=${deepgram_tts_uri}`;
    params += '}';

    return {
      filePath: `say:${params}${text.replace(/\n/g, ' ')}`,
      servedFromCache: false,
      rtt: 0
    };
  }
  try {
    const post = bent(deepgram_tts_uri || 'https://api.deepgram.com', 'POST', 'buffer', {
      // on-premise deepgram does not require an api_key
      ...(api_key && {'Authorization': `Token ${api_key}`}),
      'Accept': 'audio/mpeg',
      'Content-Type': 'application/json'
    });
    const audioContent = await post(`/v1/speak?model=${model}`, {
      text
    });
    return {
      audioContent,
      extension: 'mp3',
      sampleRate: 8000
    };
  } catch (err) {
    logger.info({err}, 'synthDeepgram returned error');
    stats.increment('tts.count', ['vendor:deepgram', 'accepted:no']);
    throw err;
  }
};

const synthCartesia = async(logger, {
  credentials, options, stats, voice, language, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
}) => {
  const {api_key, model_id, embedding, options: credOpts} = credentials;
  const opts = !!options && Object.keys(options).length !== 0 ? options : JSON.parse(credOpts || '{}');

  if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
    let params = '{';
    params += `api_key=${api_key}`;
    params += `,playback_id=${key}`;
    params += `,model_id=${model_id}`;
    params += ',vendor=cartesia';
    params += `,voice=${voice}`;
    params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
    params += `,language=${language}`;
    params += `,voice_mode=${embedding ? 'embedding' : 'id'}`;
    if (embedding) params += `,embedding=${embedding}`;
    if (opts.speed) params += `,speed=${opts.speed}`;
    if (opts.emotion) params += `,emotion=${opts.emotion}`;
    if (opts.volume) params += `,volume=${opts.volume}`;
    params += '}';

    return {
      filePath: `say:${params}${text.replace(/\n/g, ' ').replace(/\r/g, ' ')}`,
      servedFromCache: false,
      rtt: 0
    };
  }

  try {
    const client = new CartesiaClient({ apiKey: api_key });
    const sampleRate = 48000;
    const mp3Stream = await client.tts.bytes({
      modelId: model_id,
      transcript: text,
      voice: {
        mode: embedding ? 'embedding' : 'id',
        ...(embedding ?
          {
            embedding: embedding.split(',').map(Number)
          } :
          {
            id: voice
          }
        ),
        ...(model_id === 'sonic-2' && (opts.speed || opts.emotion) && {
          experimentalControls: {
            ...(opts.speed !== null && opts.speed !== undefined && {speed: opts.speed}),
            ...(opts.emotion && {emotion: [opts.emotion]}),
          }
        }),
      },
      ...(model_id === 'sonic-3' && (opts.speed || opts.emotion || opts.volume) && {
        generationConfig: {
          ...(opts.volume !== null && opts.volume !== undefined && {volume: opts.volume}),
          ...(opts.speed !== null && opts.speed !== undefined && {speed: opts.speed}),
          ...(opts.emotion !== null && opts.emotion !== undefined && {emotion: opts.emotion}),
        }
      }),
      language: language,
      outputFormat: {
        container: 'mp3',
        bitRate: 128000,
        sampleRate
      },
    });

    // bytes() returns a ReadableStream - collect all chunks
    const chunks = [];
    for await (const chunk of mp3Stream) {
      chunks.push(chunk);
    }
    const audioBuffer = Buffer.concat(chunks);

    return {
      audioContent: audioBuffer,
      extension: 'mp3',
      sampleRate
    };
  } catch (err) {
    logger.info({err}, 'synthCartesia returned error');
    stats.increment('tts.count', ['vendor:cartesia', 'accepted:no']);
    throw err;
  }
};

const synthResemble = async(logger, {
  credentials, options, stats, voice, key, text, renderForCaching, disableTtsStreaming, disableTtsCache
}) => {
  const {api_key, resemble_tts_uri, resemble_tts_use_tls} = credentials;
  const {project_uuid, use_hd} = options || {};

  /* default to using the streaming interface, unless disabled by env var OR we want just a cache file */
  if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
    let params = '{';
    params += `api_key=${api_key}`;
    params += `,playback_id=${key}`;
    params += ',vendor=resemble';
    params += `,voice=${voice}`;
    params += `,write_cache_file=${disableTtsCache ? 0 : 1}`;
    if (project_uuid) params += `,project_uuid=${project_uuid}`;
    if (use_hd) params += `,use_hd=${use_hd}`;
    if (resemble_tts_uri) params += `,endpoint=${resemble_tts_uri}`;
    if (resemble_tts_use_tls) params += `,use_tls=${resemble_tts_use_tls}`;

    params += '}';

    return {
      filePath: `say:${params}${text.replace(/\n/g, ' ').replace(/\r/g, ' ')}`,
      servedFromCache: false,
      rtt: 0
    };
  }

  try {
    const baseUrl = resemble_tts_uri || 'https://f.cluster.resemble.ai';
    const post = bent(baseUrl, 'POST', 'buffer', {
      'Authorization': `Bearer ${api_key}`,
      'Content-Type': 'application/json'
    });
    const response = await post('/synthesize', {
      voice_uuid: voice,
      data: text,
      sample_rate: 8000,
      output_format: 'mp3',
      ...(project_uuid && {project_uuid}),
      ...(use_hd && {use_hd}),
    });

    const json = JSON.parse(response.toString('utf8'));
    const audioContent = Buffer.from(json.audio_content, 'base64');
    return {
      audioContent,
      extension: 'mp3',
      sampleRate: 8000
    };
  } catch (err) {
    logger.info({err}, 'synthResemble returned error');
    stats.increment('tts.count', ['vendor:resemble', 'accepted:no']);
    throw err;
  }
};

const getFileExtFromMime = (mime) => {
  switch (mime) {
    case 'audio/wav':
    case 'audio/x-wav':
      return ['wav', 8000];
    // the ternary cases below match l16 mime types by regex: when the regex
    // matches, the case expression evaluates to the mime itself and so matches
    case /audio\/l16.*rate=8000/.test(mime) ? mime : 'cant match value':
      return ['r8', 8000];
    case /audio\/l16.*rate=16000/.test(mime) ? mime : 'cant match value':
      return ['r16', 16000];
    case /audio\/l16.*rate=24000/.test(mime) ? mime : 'cant match value':
      return ['r24', 24000];
    case /audio\/l16.*rate=32000/.test(mime) ? mime : 'cant match value':
      return ['r32', 32000];
    case /audio\/l16.*rate=48000/.test(mime) ? mime : 'cant match value':
      return ['r48', 48000];
    case 'audio/mpeg':
    case 'audio/mp3':
      return ['mp3', 8000];
    default:
      return ['wav', 8000];
  }
};
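
/*
 * Example (illustrative):
 *   getFileExtFromMime('audio/l16;rate=16000') // => ['r16', 16000]
 *   getFileExtFromMime('audio/mpeg')           // => ['mp3', 8000]
 */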

module.exports = synthAudio;