support google voice cloning

2025-12-19 03:37:49 +00:00 · 2024-10-31 20:23:11 +07:00
parent f183852961
commit 115faa9f89
5 changed files with 110 additions and 18 deletions
--- a/lib/synth-audio.js
+++ b/lib/synth-audio.js
@@ -170,7 +170,7 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
    renderForCaching
  });
  let filePath;
-  filePath = makeFilePath({vendor, key, salt, renderForCaching});
+  filePath = makeFilePath({vendor, voice, key, salt, renderForCaching});
  debug(`synth key is ${key}`);
  let cached;
  if (!disableTtsCache) {
@@ -192,7 +192,7 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
      cached = await client.get(preCachekey);
      if (cached) {
        // Precache audio is available update filpath with precache file extension.
-        filePath = makeFilePath({vendor, key, salt, renderForCaching: true});
+        filePath = makeFilePath({vendor, voice, key, salt, renderForCaching: true});
      }
    }
  }
@@ -353,6 +353,44 @@ const synthPolly = async(createHash, retrieveHash, logger,

 const synthGoogle = async(logger, {credentials, stats, language, voice, gender, text}) => {
  const client = new ttsGoogle.TextToSpeechClient(credentials);
+  // Google custom voice cloning is requested when voice carries a voice_cloning_key.
+  // As of 31 Oct 2024 the Google Node SDK does not support voice cloning yet, so the REST API is called directly.
+  if (typeof voice === 'object' && voice.voice_cloning_key) {
+    try {
+      const accessToken = await client.auth.getAccessToken();
+      const projectId = await client.getProjectId();
+
+      const post = bent('https://texttospeech.googleapis.com', 'POST', 'json', {
+        'Authorization': `Bearer ${accessToken}`,
+        'x-goog-user-project': projectId,
+        'Content-Type': 'application/json; charset=utf-8'
+      });
+
+      const payload = {
+        input: {
+          text
+        },
+        voice: {
+          language_code: language,
+          voice_clone: {
+            voice_cloning_key: voice.voice_cloning_key
+          }
+        },
+        audioConfig: {
+          // Voice cloning is currently still in the v1beta1 API, and it supports LINEAR16 in WAV format at 24,000 Hz
+          audioEncoding: 'LINEAR16',
+          sample_rate_hertz: 24000
+        }
+      };
+
+      const mp3 = await post('/v1beta1/text:synthesize', payload);
+      return Buffer.from(mp3.audioContent, 'base64');
+    } catch (err) {
+      logger.info({err: await err.text()}, 'synthGoogle returned error');
+      throw err;
+    }
+  }
+
  const opts = {
    voice: {
      ...(typeof voice === 'string' && {name: voice}),
--- a/lib/utils.js
+++ b/lib/utils.js
@@ -23,19 +23,20 @@ function makeSynthKey({
  hash.update(`${language}:${vendor}:${voice}:${engine}:${text}`);
  const hexHashKey = hash.digest('hex');
  const accountKey = account_sid ? `:${account_sid}` : '';
-  const namespace = vendor.startsWith('custom') ? vendor : getFileExtension({vendor, renderForCaching});
+  const namespace = vendor.startsWith('custom') ? vendor : getFileExtension({vendor, voice, renderForCaching});
  const key = `tts${accountKey}:${namespace}:${hexHashKey}`;
  return key;
 }

-function makeFilePath({vendor, key, salt = '', renderForCaching = false}) {
-  const extension = getFileExtension({vendor, renderForCaching});
+function makeFilePath({vendor, voice, key, salt = '', renderForCaching = false}) {
+  const extension = getFileExtension({vendor, renderForCaching, voice});
  return `${TMP_FOLDER}/${key.replace('tts:', `tts-${salt}`)}.${extension}`;
 }

-function getFileExtension({vendor, renderForCaching = false}) {
+function getFileExtension({vendor, voice, renderForCaching = false}) {
  const mp3Extension = 'mp3';
  const r8Extension = 'r8';
+  const wavExtension = 'wav';

  switch (vendor) {
    case 'azure':
@@ -58,6 +59,13 @@ function getFileExtension({vendor, renderForCaching = false}) {
    case 'nvidia':
    case 'verbio':
      return r8Extension;
+    case 'google':
+      // Google voice cloning only supports WAV output.
+      if (typeof voice === 'object' && voice.voice_cloning_key) {
+        return wavExtension;
+      } else {
+        return mp3Extension;
+      }
    default:
      // If vendor is custom
      if (vendor.startsWith('custom')) {