Merge pull request #92 from jambonz/feat/playht30

support playht3.0
2026-01-25 02:08:26 +00:00 · 2024-10-09 13:26:53 -04:00
parent 1846203807 f6cead6e92
commit b0fee6bbf1
3 changed files with 63 additions and 16 deletions
--- a/lib/synth-audio.js
+++ b/lib/synth-audio.js
@@ -20,7 +20,8 @@ const {
  createKryptonClient,
  createRivaClient,
  noopLogger,
-  makeFilePath
+  makeFilePath,
+  makePlayhtKey
 } = require('./utils');
 const getNuanceAccessToken = require('./get-nuance-access-token');
 const getVerbioAccessToken = require('./get-verbio-token');
@@ -244,7 +245,7 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
        });
        break;
      case 'playht':
-        audioBuffer = await synthPlayHT(logger, {
+        audioBuffer = await synthPlayHT(client, logger, {
          credentials, options, stats, language, voice, text, renderForCaching, disableTtsStreaming, filePath
        });
        break;
@@ -755,12 +756,37 @@ const synthElevenlabs = async(logger, {
  }
 };

-const synthPlayHT = async(logger, {
-  credentials, options, stats, voice, text, renderForCaching, disableTtsStreaming
+const synthPlayHT = async(client, logger, {
+  credentials, options, stats, voice, language, text, renderForCaching, disableTtsStreaming
 }) => {
  const {api_key, user_id, voice_engine, options: credOpts} = credentials;
  const opts = !!options && Object.keys(options).length !== 0 ? options : JSON.parse(credOpts || '{}');

+  let synthesizeUrl = 'https://api.play.ht/api/v2/tts/stream';
+
+  // For the Play3.0 voice engine, the synthesize URL is obtained from the authentication endpoint
+  if (voice_engine === 'Play3.0') {
+    try {
+      const post = bent('https://api.play.ht', 'POST', 'json', 201, {
+        'AUTHORIZATION': api_key,
+        'X-USER-ID': user_id,
+        'Accept': 'application/json'
+      });
+      const key = makePlayhtKey(api_key);
+      const url = await client.get(key);
+      if (!url) {
+        const {inference_address, expires_at_ms} = await post('/api/v3/auth');
+        synthesizeUrl = inference_address;
+        const expiry =  Math.floor((expires_at_ms - Date.now()) / 1000 - 30);
+        await client.set(key, inference_address, 'EX', expiry);
+      }
+    } catch (err) {
+      logger.info({err}, 'synth PlayHT returned error for authentication version 3.0');
+      stats.increment('tts.count', ['vendor:playht', 'accepted:no']);
+      throw err;
+    }
+  }
+
  /* default to using the streaming interface, unless disabled by env var OR we want just a cache file */
  if (!JAMBONES_DISABLE_TTS_STREAMING && !renderForCaching && !disableTtsStreaming) {
    let params = '';
@@ -769,6 +795,7 @@ const synthPlayHT = async(logger, {
    params += ',vendor=playht';
    params += `,voice=${voice}`;
    params += `,voice_engine=${voice_engine}`;
+    params += `,synthesize_url=${synthesizeUrl}`;
    params += ',write_cache_file=1';
    if (opts.quality) params += `,quality=${opts.quality}`;
    if (opts.speed) params += `,speed=${opts.speed}`;
@@ -778,6 +805,8 @@ const synthPlayHT = async(logger, {
    if (opts.voice_guidance) params += `,voice_guidance=${opts.voice_guidance}`;
    if (opts.style_guidance) params += `,style_guidance=${opts.style_guidance}`;
    if (opts.text_guidance) params += `,text_guidance=${opts.text_guidance}`;
+    if (opts.top_p) params += `,top_p=${opts.top_p}`;
+    if (opts.repetition_penalty) params += `,repetition_penalty=${opts.repetition_penalty}`;
    params += '}';

    return {
@@ -788,14 +817,18 @@ const synthPlayHT = async(logger, {
  }

  try {
-    const post = bent('https://api.play.ht', 'POST', 'buffer', {
+    const post = bent('POST', 'buffer', {
+      ...(voice_engine !== 'Play3.0' && {
        'AUTHORIZATION': api_key,
        'X-USER-ID': user_id,
+      }),
      'Accept': 'audio/mpeg',
      'Content-Type': 'application/json'
    });
-    const mp3 = await post('/api/v2/tts/stream', {
+
+    const mp3 = await post(synthesizeUrl, {
      text,
+      ...(voice_engine === 'Play3.0' && { language }),
      voice,
      voice_engine,
      output_format: 'mp3',
--- a/lib/utils.js
+++ b/lib/utils.js
@@ -98,6 +98,11 @@ function makeAwsKey(awsAccessKeyId) {
  return `aws:${hash.digest('hex')}`;
 }

+function makePlayhtKey(apiKey) { // returns 'playht:' followed by the hex SHA-1 digest of apiKey
+  const hash = crypto.createHash('sha1');
+  hash.update(apiKey);
+  return `playht:${hash.digest('hex')}`; // only the digest, not the raw apiKey, appears in the key
+}
 function makeVerbioKey(client_id) {
  const hash = crypto.createHash('sha1');
  hash.update(client_id);
@@ -171,6 +176,7 @@ module.exports = {
  makeSynthKey,
  makeNuanceKey,
  makeIbmKey,
+  makePlayhtKey,
  makeAwsKey,
  makeVerbioKey,
  getNuanceAccessToken,
--- a/test/synth.js
+++ b/test/synth.js
@@ -574,9 +574,9 @@ test('Elevenlabs speech synth tests', async(t) => {
    t.end(err);
  }
  client.quit();
-})
+});

-test('PlayHT speech synth tests', async(t) => {
+const testPlayHT = async(t, voice_engine) => {
  const fn = require('..');
  const {synthAudio, client} = fn(opts, logger);

@@ -584,26 +584,26 @@ test('PlayHT speech synth tests', async(t) => {
    t.pass('skipping PlayHT speech synth tests since PLAYHT_API_KEY or PLAYHT_USER_ID is/are not provided');
    return t.end();
  }
-  const text = 'Hi there and welcome to jambones!';
+  const text = 'Hi there and welcome to jambones! ' + Date.now();
  try {
-    let opts = await synthAudio(stats, {
+    const opts = await synthAudio(stats, {
      vendor: 'playht',
      credentials: {
        api_key: process.env.PLAYHT_API_KEY,
        user_id: process.env.PLAYHT_USER_ID,
-        voice_engine: 'PlayHT2.0-turbo',
+        voice_engine,
        options: JSON.stringify({
-          quality: "medium",
+          quality: 'medium',
          speed: 1,
          seed: 1,
          temperature: 1,
-          emotion: "female_happy",
+          emotion: 'female_happy',
          voice_guidance: 3,
          style_guidance: 20,
          text_guidance: 1,
        })
      },
-      language: 'en-US',
+      language: 'english',
      voice: 's3://voice-cloning-zero-shot/d9ff78ba-d016-47f6-b0ef-dd630f59414e/female-cs/manifest.json',
      text,
      renderForCaching: true
@@ -615,6 +615,14 @@ test('PlayHT speech synth tests', async(t) => {
    t.end(err);
  }
  client.quit();
+};
+
+test('PlayHT speech synth tests', async(t) => { // runs the shared testPlayHT body with PlayHT2.0-turbo
+  await testPlayHT(t, 'PlayHT2.0-turbo');
+});
+
+test('PlayHT3.0 speech synth tests', async(t) => { // runs the shared testPlayHT body with Play3.0
+  await testPlayHT(t, 'Play3.0');
 });

 test('rimelabs speech synth tests', async(t) => {