Support Gemini TTS

2026-01-25 02:08:26 +00:00 · 2026-01-11 07:30:18 +07:00
parent 5f7e7458bb
commit 0ea7082da2
3 changed files with 1137 additions and 363 deletions
--- a/lib/synth-audio.js
+++ b/lib/synth-audio.js
@@ -204,7 +204,9 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
    const startAt = process.hrtime();
    switch (vendor) {
      case 'google':
-        audioData = await synthGoogle(logger, {credentials, stats, language, voice, gender, key, text});
+        audioData = await synthGoogle(logger, {
          credentials, stats, language, voice, gender, key, text, model, options, instructions
        });
        break;
      case 'aws':
      case 'polly':
@@ -409,72 +411,86 @@ const synthPolly = async(createHash, retrieveHash, logger,
  }
 };
-const synthGoogle = async(logger, {credentials, stats, language, voice, gender, text}) => {
+
 const synthGoogle = async(logger, {
  credentials, stats, language, voice, gender, text, model, options, instructions
 }) => {
  const client = new ttsGoogle.TextToSpeechClient(credentials);
  // If google custom voice cloning is used.
  // As of 31 Oct 2024, the Google Node SDK does not yet support voice cloning.
  if (typeof voice === 'object' && voice.voice_cloning_key) {
    try {
      const accessToken = await client.auth.getAccessToken();
      const projectId = await client.getProjectId();
-      const post = bent('https://texttospeech.googleapis.com', 'POST', 'json', {
+  const isGemini = credentials.use_gemini_tts;
-        'Authorization': `Bearer ${accessToken}`,
+  const isVoiceCloning = typeof voice === 'object' && voice.voice_cloning_key;
        'x-goog-user-project': projectId,
        'Content-Type': 'application/json; charset=utf-8'
      });
-      const payload = {
+  // Build input based on voice type
-        input: {
+  let input;
-          text
+  if (isGemini) {
-        },
+    // Gemini TTS does not support SSML - strip tags if present
-        voice: {
+    let inputText = text;
-          language_code: language,
+    if (text.startsWith('<speak>')) {
-          voice_clone: {
+      inputText = text.replace(/<[^>]*>/g, '').trim();
-            voice_cloning_key: voice.voice_cloning_key
+      logger.info('synthGoogle: Gemini TTS does not support SSML, stripped tags from input');
          }
        },
        audioConfig: {
          // Voice cloning is still in the v1 beta version at this time, and it supports LINEAR16 in WAV format at 24,000 Hz
          audioEncoding: 'LINEAR16',
          sample_rate_hertz: 24000
        }
      };
      const wav = await post('/v1beta1/text:synthesize', payload);
      return {
        audioContent: Buffer.from(wav.audioContent, 'base64'),
        extension: 'wav',
        sampleRate: 24000
      };
    } catch (err) {
      logger.info({err: await err.text()}, 'synthGoogle returned error');
      throw err;
    }
    // Use instructions as prompt for Gemini TTS style control, options.prompt can override
    const prompt = options?.prompt || instructions;
    input = {
      text: inputText,
      ...(prompt && { prompt })
    };
  } else {
    input = text.startsWith('<speak>') ? { ssml: text } : { text };
  }
-  const opts = {
+  // Build voice selection params based on voice type
-    voice: {
+  let voiceParams;
-      ...(typeof voice === 'string' && {name: voice}),
+  if (isGemini) {
-      ...(typeof voice === 'object' && {customVoice: voice}),
+    voiceParams = {
      languageCode: language || 'en-US',
      name: voice,
      modelName: model
    };
  } else if (isVoiceCloning) {
    voiceParams = {
      languageCode: language,
      voiceClone: {
        voiceCloningKey: voice.voice_cloning_key
      }
    };
  } else {
    voiceParams = {
      ...(typeof voice === 'string' && { name: voice }),
      ...(typeof voice === 'object' && { customVoice: voice }),
      languageCode: language,
      ssmlGender: gender || 'SSML_VOICE_GENDER_UNSPECIFIED'
-    },
+    };
-    audioConfig: {audioEncoding: 'MP3'}
+  }
-  };
+
-  Object.assign(opts, {input: text.startsWith('<speak>') ? {ssml: text} : {text}});
+  // Build audio config based on voice type
  let audioConfig;
  let extension;
  let sampleRate;
  if (isGemini || isVoiceCloning) {
    audioConfig = { audioEncoding: 'LINEAR16', sampleRateHertz: 24000 };
    extension = 'r24';
    sampleRate = 24000;
  } else {
    audioConfig = { audioEncoding: 'MP3' };
    extension = 'mp3';
    sampleRate = 8000;
  }
  const opts = { input, voice: voiceParams, audioConfig };
  try {
-    const responses = await client.synthesizeSpeech(opts);
+    logger.debug({ opts }, 'synthGoogle: request');
    const [response] = await client.synthesizeSpeech(opts);
    stats.increment('tts.count', ['vendor:google', 'accepted:yes']);
    client.close();
    return {
-      audioContent: responses[0].audioContent,
+      audioContent: response.audioContent,
-      extension: 'mp3',
+      extension,
-      sampleRate: 8000
+      sampleRate
    };
  } catch (err) {
-    console.error(err);
+    logger.info({ err, opts }, 'synthAudio: Error synthesizing speech using google');
    logger.info({err, opts}, 'synthAudio: Error synthesizing speech using google');
    stats.increment('tts.count', ['vendor:google', 'accepted:no']);
    client && client.close();
    throw err;
--- a/package-lock.json
+++ b/package-lock.json
--- a/package.json
+++ b/package.json
@@ -30,7 +30,7 @@
    "@aws-sdk/client-polly": "^3.496.0",
    "@aws-sdk/client-sts": "^3.496.0",
    "@cartesia/cartesia-js": "^2.2.7",
-    "@google-cloud/text-to-speech": "^5.5.0",
+    "@google-cloud/text-to-speech": "^6.4.0",
    "@grpc/grpc-js": "^1.9.14",
    "@jambonz/realtimedb-helpers": "^0.8.7",
    "bent": "^7.3.12",