mirror of
https://github.com/jambonz/speech-utils.git
synced 2026-01-25 02:08:26 +00:00
support gemini tts
This commit is contained in:
@@ -204,7 +204,9 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
|
||||
const startAt = process.hrtime();
|
||||
switch (vendor) {
|
||||
case 'google':
|
||||
audioData = await synthGoogle(logger, {credentials, stats, language, voice, gender, key, text});
|
||||
audioData = await synthGoogle(logger, {
|
||||
credentials, stats, language, voice, gender, key, text, model, options, instructions
|
||||
});
|
||||
break;
|
||||
case 'aws':
|
||||
case 'polly':
|
||||
@@ -409,72 +411,86 @@ const synthPolly = async(createHash, retrieveHash, logger,
|
||||
}
|
||||
};
|
||||
|
||||
const synthGoogle = async(logger, {credentials, stats, language, voice, gender, text}) => {
|
||||
|
||||
const synthGoogle = async(logger, {
|
||||
credentials, stats, language, voice, gender, text, model, options, instructions
|
||||
}) => {
|
||||
const client = new ttsGoogle.TextToSpeechClient(credentials);
|
||||
// If google custom voice cloning is used.
|
||||
// As of 31 Oct 2024, the Google Node SDK does not yet support voice cloning.
|
||||
if (typeof voice === 'object' && voice.voice_cloning_key) {
|
||||
try {
|
||||
const accessToken = await client.auth.getAccessToken();
|
||||
const projectId = await client.getProjectId();
|
||||
|
||||
const post = bent('https://texttospeech.googleapis.com', 'POST', 'json', {
|
||||
'Authorization': `Bearer ${accessToken}`,
|
||||
'x-goog-user-project': projectId,
|
||||
'Content-Type': 'application/json; charset=utf-8'
|
||||
});
|
||||
const isGemini = credentials.use_gemini_tts;
|
||||
const isVoiceCloning = typeof voice === 'object' && voice.voice_cloning_key;
|
||||
|
||||
const payload = {
|
||||
input: {
|
||||
text
|
||||
},
|
||||
voice: {
|
||||
language_code: language,
|
||||
voice_clone: {
|
||||
voice_cloning_key: voice.voice_cloning_key
|
||||
}
|
||||
},
|
||||
audioConfig: {
|
||||
// Voice cloning is currently only available in the v1beta1 API, and it supports LINEAR16 in WAV format at 24,000 Hz
|
||||
audioEncoding: 'LINEAR16',
|
||||
sample_rate_hertz: 24000
|
||||
}
|
||||
};
|
||||
|
||||
const wav = await post('/v1beta1/text:synthesize', payload);
|
||||
return {
|
||||
audioContent: Buffer.from(wav.audioContent, 'base64'),
|
||||
extension: 'wav',
|
||||
sampleRate: 24000
|
||||
};
|
||||
} catch (err) {
|
||||
logger.info({err: await err.text()}, 'synthGoogle returned error');
|
||||
throw err;
|
||||
// Build input based on voice type
|
||||
let input;
|
||||
if (isGemini) {
|
||||
// Gemini TTS does not support SSML - strip tags if present
|
||||
let inputText = text;
|
||||
if (text.startsWith('<speak>')) {
|
||||
inputText = text.replace(/<[^>]*>/g, '').trim();
|
||||
logger.info('synthGoogle: Gemini TTS does not support SSML, stripped tags from input');
|
||||
}
|
||||
// Use instructions as the prompt for Gemini TTS style control; options.prompt, when set, takes precedence
|
||||
const prompt = options?.prompt || instructions;
|
||||
input = {
|
||||
text: inputText,
|
||||
...(prompt && { prompt })
|
||||
};
|
||||
} else {
|
||||
input = text.startsWith('<speak>') ? { ssml: text } : { text };
|
||||
}
|
||||
|
||||
const opts = {
|
||||
voice: {
|
||||
...(typeof voice === 'string' && {name: voice}),
|
||||
...(typeof voice === 'object' && {customVoice: voice}),
|
||||
// Build voice selection params based on voice type
|
||||
let voiceParams;
|
||||
if (isGemini) {
|
||||
voiceParams = {
|
||||
languageCode: language || 'en-US',
|
||||
name: voice,
|
||||
modelName: model
|
||||
};
|
||||
} else if (isVoiceCloning) {
|
||||
voiceParams = {
|
||||
languageCode: language,
|
||||
voiceClone: {
|
||||
voiceCloningKey: voice.voice_cloning_key
|
||||
}
|
||||
};
|
||||
} else {
|
||||
voiceParams = {
|
||||
...(typeof voice === 'string' && { name: voice }),
|
||||
...(typeof voice === 'object' && { customVoice: voice }),
|
||||
languageCode: language,
|
||||
ssmlGender: gender || 'SSML_VOICE_GENDER_UNSPECIFIED'
|
||||
},
|
||||
audioConfig: {audioEncoding: 'MP3'}
|
||||
};
|
||||
Object.assign(opts, {input: text.startsWith('<speak>') ? {ssml: text} : {text}});
|
||||
};
|
||||
}
|
||||
|
||||
// Build audio config based on voice type
|
||||
let audioConfig;
|
||||
let extension;
|
||||
let sampleRate;
|
||||
if (isGemini || isVoiceCloning) {
|
||||
audioConfig = { audioEncoding: 'LINEAR16', sampleRateHertz: 24000 };
|
||||
extension = 'r24';
|
||||
sampleRate = 24000;
|
||||
} else {
|
||||
audioConfig = { audioEncoding: 'MP3' };
|
||||
extension = 'mp3';
|
||||
sampleRate = 8000;
|
||||
}
|
||||
|
||||
const opts = { input, voice: voiceParams, audioConfig };
|
||||
|
||||
try {
|
||||
const responses = await client.synthesizeSpeech(opts);
|
||||
logger.debug({ opts }, 'synthGoogle: request');
|
||||
const [response] = await client.synthesizeSpeech(opts);
|
||||
stats.increment('tts.count', ['vendor:google', 'accepted:yes']);
|
||||
client.close();
|
||||
return {
|
||||
audioContent: responses[0].audioContent,
|
||||
extension: 'mp3',
|
||||
sampleRate: 8000
|
||||
audioContent: response.audioContent,
|
||||
extension,
|
||||
sampleRate
|
||||
};
|
||||
} catch (err) {
|
||||
console.error(err);
|
||||
logger.info({err, opts}, 'synthAudio: Error synthesizing speech using google');
|
||||
logger.info({ err, opts }, 'synthAudio: Error synthesizing speech using google');
|
||||
stats.increment('tts.count', ['vendor:google', 'accepted:no']);
|
||||
client && client.close();
|
||||
throw err;
|
||||
|
||||
Reference in New Issue
Block a user