Support Gemini TTS

2026-01-25 02:08:26 +00:00 · 2026-01-11 07:30:18 +07:00
parent 5f7e7458bb
commit 0ea7082da2
3 changed files with 1137 additions and 363 deletions
--- a/lib/synth-audio.js
+++ b/lib/synth-audio.js
@@ -204,7 +204,9 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
    const startAt = process.hrtime();
    switch (vendor) {
      case 'google':
-        audioData = await synthGoogle(logger, {credentials, stats, language, voice, gender, key, text});
+        audioData = await synthGoogle(logger, {
+          credentials, stats, language, voice, gender, key, text, model, options, instructions
+        });
        break;
      case 'aws':
      case 'polly':
@@ -409,72 +411,86 @@ const synthPolly = async(createHash, retrieveHash, logger,
  }
 };

-const synthGoogle = async(logger, {credentials, stats, language, voice, gender, text}) => {
+
+const synthGoogle = async(logger, {
+  credentials, stats, language, voice, gender, text, model, options, instructions
+}) => {
  const client = new ttsGoogle.TextToSpeechClient(credentials);
-  // If google custom voice cloning is used.
-  // At this time 31 Oct 2024, google node sdk has not support voice cloning yet.
-  if (typeof voice === 'object' && voice.voice_cloning_key) {
-    try {
-      const accessToken = await client.auth.getAccessToken();
-      const projectId = await client.getProjectId();

-      const post = bent('https://texttospeech.googleapis.com', 'POST', 'json', {
-        'Authorization': `Bearer ${accessToken}`,
-        'x-goog-user-project': projectId,
-        'Content-Type': 'application/json; charset=utf-8'
-      });
+  const isGemini = credentials.use_gemini_tts;
+  const isVoiceCloning = typeof voice === 'object' && voice.voice_cloning_key;

-      const payload = {
-        input: {
-          text
-        },
-        voice: {
-          language_code: language,
-          voice_clone: {
-            voice_cloning_key: voice.voice_cloning_key
-          }
-        },
-        audioConfig: {
-          // Cloning voice at this time still in v1 beta version, and it support LINEAR16 in Wav format, 24.000Hz
-          audioEncoding: 'LINEAR16',
-          sample_rate_hertz: 24000
-        }
-      };
-
-      const wav = await post('/v1beta1/text:synthesize', payload);
-      return {
-        audioContent: Buffer.from(wav.audioContent, 'base64'),
-        extension: 'wav',
-        sampleRate: 24000
-      };
-    } catch (err) {
-      logger.info({err: await err.text()}, 'synthGoogle returned error');
-      throw err;
+  // Build input based on voice type
+  let input;
+  if (isGemini) {
+    // Gemini TTS does not support SSML - strip tags if present
+    let inputText = text;
+    if (text.startsWith('<speak>')) {
+      inputText = text.replace(/<[^>]*>/g, '').trim();
+      logger.info('synthGoogle: Gemini TTS does not support SSML, stripped tags from input');
    }
+    // Use instructions as prompt for Gemini TTS style control, options.prompt can override
+    const prompt = options?.prompt || instructions;
+    input = {
+      text: inputText,
+      ...(prompt && { prompt })
+    };
+  } else {
+    input = text.startsWith('<speak>') ? { ssml: text } : { text };
  }

-  const opts = {
-    voice: {
-      ...(typeof voice === 'string' && {name: voice}),
-      ...(typeof voice === 'object' && {customVoice: voice}),
+  // Build voice selection params based on voice type
+  let voiceParams;
+  if (isGemini) {
+    voiceParams = {
+      languageCode: language || 'en-US',
+      name: voice,
+      modelName: model
+    };
+  } else if (isVoiceCloning) {
+    voiceParams = {
+      languageCode: language,
+      voiceClone: {
+        voiceCloningKey: voice.voice_cloning_key
+      }
+    };
+  } else {
+    voiceParams = {
+      ...(typeof voice === 'string' && { name: voice }),
+      ...(typeof voice === 'object' && { customVoice: voice }),
      languageCode: language,
      ssmlGender: gender || 'SSML_VOICE_GENDER_UNSPECIFIED'
-    },
-    audioConfig: {audioEncoding: 'MP3'}
-  };
-  Object.assign(opts, {input: text.startsWith('<speak>') ? {ssml: text} : {text}});
+    };
+  }
+
+  // Build audio config based on voice type
+  let audioConfig;
+  let extension;
+  let sampleRate;
+  if (isGemini || isVoiceCloning) {
+    audioConfig = { audioEncoding: 'LINEAR16', sampleRateHertz: 24000 };
+    extension = 'r24';
+    sampleRate = 24000;
+  } else {
+    audioConfig = { audioEncoding: 'MP3' };
+    extension = 'mp3';
+    sampleRate = 8000;
+  }
+
+  const opts = { input, voice: voiceParams, audioConfig };
+
  try {
-    const responses = await client.synthesizeSpeech(opts);
+    logger.debug({ opts }, 'synthGoogle: request');
+    const [response] = await client.synthesizeSpeech(opts);
    stats.increment('tts.count', ['vendor:google', 'accepted:yes']);
    client.close();
    return {
-      audioContent: responses[0].audioContent,
-      extension: 'mp3',
-      sampleRate: 8000
+      audioContent: response.audioContent,
+      extension,
+      sampleRate
    };
  } catch (err) {
-    console.error(err);
-    logger.info({err, opts}, 'synthAudio: Error synthesizing speech using google');
+    logger.info({ err, opts }, 'synthAudio: Error synthesizing speech using google');
    stats.increment('tts.count', ['vendor:google', 'accepted:no']);
    client && client.close();
    throw err;