Support Gemini TTS

This commit is contained in:
Hoan HL
2026-01-11 07:30:18 +07:00
parent 5f7e7458bb
commit 0ea7082da2
3 changed files with 1137 additions and 363 deletions

View File

@@ -204,7 +204,9 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
const startAt = process.hrtime(); const startAt = process.hrtime();
switch (vendor) { switch (vendor) {
case 'google': case 'google':
audioData = await synthGoogle(logger, {credentials, stats, language, voice, gender, key, text}); audioData = await synthGoogle(logger, {
credentials, stats, language, voice, gender, key, text, model, options, instructions
});
break; break;
case 'aws': case 'aws':
case 'polly': case 'polly':
@@ -409,72 +411,86 @@ const synthPolly = async(createHash, retrieveHash, logger,
} }
}; };
const synthGoogle = async(logger, {credentials, stats, language, voice, gender, text}) => {
const synthGoogle = async(logger, {
credentials, stats, language, voice, gender, text, model, options, instructions
}) => {
const client = new ttsGoogle.TextToSpeechClient(credentials); const client = new ttsGoogle.TextToSpeechClient(credentials);
// If google custom voice cloning is used.
// As of 31 Oct 2024, the Google Node SDK does not yet support voice cloning.
if (typeof voice === 'object' && voice.voice_cloning_key) {
try {
const accessToken = await client.auth.getAccessToken();
const projectId = await client.getProjectId();
const post = bent('https://texttospeech.googleapis.com', 'POST', 'json', { const isGemini = credentials.use_gemini_tts;
'Authorization': `Bearer ${accessToken}`, const isVoiceCloning = typeof voice === 'object' && voice.voice_cloning_key;
'x-goog-user-project': projectId,
'Content-Type': 'application/json; charset=utf-8'
});
const payload = { // Build input based on voice type
input: { let input;
text if (isGemini) {
}, // Gemini TTS does not support SSML - strip tags if present
voice: { let inputText = text;
language_code: language, if (text.startsWith('<speak>')) {
voice_clone: { inputText = text.replace(/<[^>]*>/g, '').trim();
voice_cloning_key: voice.voice_cloning_key logger.info('synthGoogle: Gemini TTS does not support SSML, stripped tags from input');
}
},
audioConfig: {
// Voice cloning is currently only available in the v1beta1 API, and it supports LINEAR16 in WAV format at 24,000 Hz
audioEncoding: 'LINEAR16',
sample_rate_hertz: 24000
}
};
const wav = await post('/v1beta1/text:synthesize', payload);
return {
audioContent: Buffer.from(wav.audioContent, 'base64'),
extension: 'wav',
sampleRate: 24000
};
} catch (err) {
logger.info({err: await err.text()}, 'synthGoogle returned error');
throw err;
} }
// Use instructions as prompt for Gemini TTS style control, options.prompt can override
const prompt = options?.prompt || instructions;
input = {
text: inputText,
...(prompt && { prompt })
};
} else {
input = text.startsWith('<speak>') ? { ssml: text } : { text };
} }
const opts = { // Build voice selection params based on voice type
voice: { let voiceParams;
...(typeof voice === 'string' && {name: voice}), if (isGemini) {
...(typeof voice === 'object' && {customVoice: voice}), voiceParams = {
languageCode: language || 'en-US',
name: voice,
modelName: model
};
} else if (isVoiceCloning) {
voiceParams = {
languageCode: language,
voiceClone: {
voiceCloningKey: voice.voice_cloning_key
}
};
} else {
voiceParams = {
...(typeof voice === 'string' && { name: voice }),
...(typeof voice === 'object' && { customVoice: voice }),
languageCode: language, languageCode: language,
ssmlGender: gender || 'SSML_VOICE_GENDER_UNSPECIFIED' ssmlGender: gender || 'SSML_VOICE_GENDER_UNSPECIFIED'
}, };
audioConfig: {audioEncoding: 'MP3'} }
};
Object.assign(opts, {input: text.startsWith('<speak>') ? {ssml: text} : {text}}); // Build audio config based on voice type
let audioConfig;
let extension;
let sampleRate;
if (isGemini || isVoiceCloning) {
audioConfig = { audioEncoding: 'LINEAR16', sampleRateHertz: 24000 };
extension = 'r24';
sampleRate = 24000;
} else {
audioConfig = { audioEncoding: 'MP3' };
extension = 'mp3';
sampleRate = 8000;
}
const opts = { input, voice: voiceParams, audioConfig };
try { try {
const responses = await client.synthesizeSpeech(opts); logger.debug({ opts }, 'synthGoogle: request');
const [response] = await client.synthesizeSpeech(opts);
stats.increment('tts.count', ['vendor:google', 'accepted:yes']); stats.increment('tts.count', ['vendor:google', 'accepted:yes']);
client.close(); client.close();
return { return {
audioContent: responses[0].audioContent, audioContent: response.audioContent,
extension: 'mp3', extension,
sampleRate: 8000 sampleRate
}; };
} catch (err) { } catch (err) {
console.error(err); logger.info({ err, opts }, 'synthAudio: Error synthesizing speech using google');
logger.info({err, opts}, 'synthAudio: Error synthesizing speech using google');
stats.increment('tts.count', ['vendor:google', 'accepted:no']); stats.increment('tts.count', ['vendor:google', 'accepted:no']);
client && client.close(); client && client.close();
throw err; throw err;

1376
package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@@ -30,7 +30,7 @@
"@aws-sdk/client-polly": "^3.496.0", "@aws-sdk/client-polly": "^3.496.0",
"@aws-sdk/client-sts": "^3.496.0", "@aws-sdk/client-sts": "^3.496.0",
"@cartesia/cartesia-js": "^2.2.7", "@cartesia/cartesia-js": "^2.2.7",
"@google-cloud/text-to-speech": "^5.5.0", "@google-cloud/text-to-speech": "^6.4.0",
"@grpc/grpc-js": "^1.9.14", "@grpc/grpc-js": "^1.9.14",
"@jambonz/realtimedb-helpers": "^0.8.7", "@jambonz/realtimedb-helpers": "^0.8.7",
"bent": "^7.3.12", "bent": "^7.3.12",