From 1a04fd736cce0e2e2915c5b3b258c6cbbf4f1389 Mon Sep 17 00:00:00 2001 From: Quan HL Date: Fri, 27 Sep 2024 12:08:41 +0700 Subject: [PATCH 1/4] support playht3.0 --- lib/synth-audio.js | 35 ++++++++++++++++++++++++++++++++--- lib/utils.js | 6 ++++++ test/synth.js | 22 +++++++++++++++------- 3 files changed, 53 insertions(+), 10 deletions(-) diff --git a/lib/synth-audio.js b/lib/synth-audio.js index 808a41a..ea94b34 100644 --- a/lib/synth-audio.js +++ b/lib/synth-audio.js @@ -20,7 +20,8 @@ const { createKryptonClient, createRivaClient, noopLogger, - makeFilePath + makeFilePath, + makePlayhtKey } = require('./utils'); const getNuanceAccessToken = require('./get-nuance-access-token'); const getVerbioAccessToken = require('./get-verbio-token'); @@ -244,7 +245,7 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc }); break; case 'playht': - audioBuffer = await synthPlayHT(logger, { + audioBuffer = await synthPlayHT(client, logger, { credentials, options, stats, language, voice, text, renderForCaching, disableTtsStreaming, filePath }); break; @@ -755,12 +756,38 @@ const synthElevenlabs = async(logger, { } }; -const synthPlayHT = async(logger, { +const synthPlayHT = async(client, logger, { credentials, options, stats, voice, text, renderForCaching, disableTtsStreaming }) => { const {api_key, user_id, voice_engine, options: credOpts} = credentials; const opts = !!options && Object.keys(options).length !== 0 ? options : JSON.parse(credOpts || '{}'); + let synthesizeUrl = 'https://api.play.ht/api/v2/tts/stream'; + + // If model is play3.0, the stream url is provided by v3 auth endpoint which is + // including jwt token as request params. 
+ if (voice_engine === 'Play3.0') { + try { + const post = bent('https://api.play.ht', 'POST', 'json', 201, { + 'AUTHORIZATION': api_key, + 'X-USER-ID': user_id, + 'Accept': 'application/json' + }); + const key = makePlayhtKey(api_key); + const url = await client.get(key); + if (!url) { + const {inference_address, expires_at_ms} = await post('/api/v3/auth'); + synthesizeUrl = inference_address; + const expiry = Math.floor((expires_at_ms - Date.now()) / 1000 - 30); + await client.set(key, inference_address, 'EX', expiry); + } + } catch (err) { + logger.info({err}, 'synth PlayHT returned error for authentication version 3.0'); + stats.increment('tts.count', ['vendor:playht', 'accepted:no']); + throw err; + } + } + /* default to using the streaming interface, unless disabled by env var OR we want just a cache file */ if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) { let params = ''; @@ -769,6 +796,7 @@ const synthPlayHT = async(logger, { params += ',vendor=playht'; params += `,voice=${voice}`; params += `,voice_engine=${voice_engine}`; + params += `,synthesize_url=${synthesizeUrl}`; params += ',write_cache_file=1'; if (opts.quality) params += `,quality=${opts.quality}`; if (opts.speed) params += `,speed=${opts.speed}`; @@ -794,6 +822,7 @@ const synthPlayHT = async(logger, { 'Accept': 'audio/mpeg', 'Content-Type': 'application/json' }); + const mp3 = await post('/api/v2/tts/stream', { text, voice, diff --git a/lib/utils.js b/lib/utils.js index a6b5970..721968a 100644 --- a/lib/utils.js +++ b/lib/utils.js @@ -98,6 +98,11 @@ function makeAwsKey(awsAccessKeyId) { return `aws:${hash.digest('hex')}`; } +function makePlayhtKey(apiKey) { + const hash = crypto.createHash('sha1'); + hash.update(apiKey); + return `playht:${hash.digest('hex')}`; +} function makeVerbioKey(client_id) { const hash = crypto.createHash('sha1'); hash.update(client_id); @@ -171,6 +176,7 @@ module.exports = { makeSynthKey, makeNuanceKey, makeIbmKey, + makePlayhtKey, 
makeAwsKey, makeVerbioKey, getNuanceAccessToken, diff --git a/test/synth.js b/test/synth.js index 1c80ff5..b9324fc 100644 --- a/test/synth.js +++ b/test/synth.js @@ -574,9 +574,9 @@ test('Elevenlabs speech synth tests', async(t) => { t.end(err); } client.quit(); -}) +}); -test('PlayHT speech synth tests', async(t) => { +const testPlayHT = async(t, voice_engine) => { const fn = require('..'); const {synthAudio, client} = fn(opts, logger); @@ -584,20 +584,20 @@ test('PlayHT speech synth tests', async(t) => { t.pass('skipping PlayHT speech synth tests since PLAYHT_API_KEY or PLAYHT_USER_ID is/are not provided'); return t.end(); } - const text = 'Hi there and welcome to jambones!'; + const text = 'Hi there and welcome to jambones! ' + Date.now(); try { - let opts = await synthAudio(stats, { + const opts = await synthAudio(stats, { vendor: 'playht', credentials: { api_key: process.env.PLAYHT_API_KEY, user_id: process.env.PLAYHT_USER_ID, - voice_engine: 'PlayHT2.0-turbo', + voice_engine, options: JSON.stringify({ - quality: "medium", + quality: 'medium', speed: 1, seed: 1, temperature: 1, - emotion: "female_happy", + emotion: 'female_happy', voice_guidance: 3, style_guidance: 20, text_guidance: 1, @@ -615,6 +615,14 @@ test('PlayHT speech synth tests', async(t) => { t.end(err); } client.quit(); +}; + +test('PlayHT speech synth tests', async(t) => { + await testPlayHT(t, 'PlayHT2.0-turbo'); +}); + +test('PlayHT3.0 speech synth tests', async(t) => { + await testPlayHT(t, 'Play3.0'); }); test('rimelabs speech synth tests', async(t) => { From 6794a0b3beea9625c268f3e954f90b1ba3ef80fa Mon Sep 17 00:00:00 2001 From: Quan HL Date: Fri, 27 Sep 2024 12:25:47 +0700 Subject: [PATCH 2/4] support playht3.0 --- lib/synth-audio.js | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/lib/synth-audio.js b/lib/synth-audio.js index ea94b34..84925ce 100644 --- a/lib/synth-audio.js +++ b/lib/synth-audio.js @@ -816,14 +816,16 @@ const synthPlayHT = async(client, logger, 
{ } try { - const post = bent('https://api.play.ht', 'POST', 'buffer', { - 'AUTHORIZATION': api_key, - 'X-USER-ID': user_id, + const post = bent('POST', 'buffer', { + ...(voice_engine !== 'Play3.0' && { + 'AUTHORIZATION': api_key, + 'X-USER-ID': user_id, + }), 'Accept': 'audio/mpeg', 'Content-Type': 'application/json' }); - const mp3 = await post('/api/v2/tts/stream', { + const mp3 = await post(synthesizeUrl, { text, voice, voice_engine, From 05fc96edc0bbb22c1fb162cc1428cf00d103e71f Mon Sep 17 00:00:00 2001 From: Quan HL Date: Fri, 27 Sep 2024 18:24:03 +0700 Subject: [PATCH 3/4] wip --- lib/synth-audio.js | 3 ++- test/synth.js | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/synth-audio.js b/lib/synth-audio.js index 84925ce..f20a982 100644 --- a/lib/synth-audio.js +++ b/lib/synth-audio.js @@ -757,7 +757,7 @@ const synthElevenlabs = async(logger, { }; const synthPlayHT = async(client, logger, { - credentials, options, stats, voice, text, renderForCaching, disableTtsStreaming + credentials, options, stats, voice, language, text, renderForCaching, disableTtsStreaming }) => { const {api_key, user_id, voice_engine, options: credOpts} = credentials; const opts = !!options && Object.keys(options).length !== 0 ? 
options : JSON.parse(credOpts || '{}'); @@ -827,6 +827,7 @@ const synthPlayHT = async(client, logger, { const mp3 = await post(synthesizeUrl, { text, + ...(voice_engine === 'Play3.0' && { language }), voice, voice_engine, output_format: 'mp3', diff --git a/test/synth.js b/test/synth.js index b9324fc..a160d2b 100644 --- a/test/synth.js +++ b/test/synth.js @@ -603,7 +603,7 @@ const testPlayHT = async(t, voice_engine) => { text_guidance: 1, }) }, - language: 'en-US', + language: 'english', voice: 's3://voice-cloning-zero-shot/d9ff78ba-d016-47f6-b0ef-dd630f59414e/female-cs/manifest.json', text, renderForCaching: true From f6cead6e92a3d29c8e339aa413ab580c9cdb05ee Mon Sep 17 00:00:00 2001 From: Quan HL Date: Thu, 3 Oct 2024 19:24:23 +0700 Subject: [PATCH 4/4] add top_p and repetition_penalty to playht3.0 --- lib/synth-audio.js | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/synth-audio.js b/lib/synth-audio.js index f20a982..52ad754 100644 --- a/lib/synth-audio.js +++ b/lib/synth-audio.js @@ -764,8 +764,7 @@ const synthPlayHT = async(client, logger, { let synthesizeUrl = 'https://api.play.ht/api/v2/tts/stream'; - // If model is play3.0, the stream url is provided by v3 auth endpoint which is - // including jwt token as request params. + // If the model is Play3.0, synthesizeUrl is obtained from the v3 authentication endpoint if (voice_engine === 'Play3.0') { try { const post = bent('https://api.play.ht', 'POST', 'json', 201, { @@ -806,6 +805,8 @@ const synthPlayHT = async(client, logger, { if (opts.voice_guidance) params += `,voice_guidance=${opts.voice_guidance}`; if (opts.style_guidance) params += `,style_guidance=${opts.style_guidance}`; if (opts.text_guidance) params += `,text_guidance=${opts.text_guidance}`; + if (opts.top_p) params += `,top_p=${opts.top_p}`; + if (opts.repetition_penalty) params += `,repetition_penalty=${opts.repetition_penalty}`; params += '}'; return {