mirror of
https://github.com/jambonz/speech-utils.git
synced 2026-01-25 02:08:26 +00:00
support gemini tts
This commit is contained in:
@@ -204,7 +204,9 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
|
|||||||
const startAt = process.hrtime();
|
const startAt = process.hrtime();
|
||||||
switch (vendor) {
|
switch (vendor) {
|
||||||
case 'google':
|
case 'google':
|
||||||
audioData = await synthGoogle(logger, {credentials, stats, language, voice, gender, key, text});
|
audioData = await synthGoogle(logger, {
|
||||||
|
credentials, stats, language, voice, gender, key, text, model, options, instructions
|
||||||
|
});
|
||||||
break;
|
break;
|
||||||
case 'aws':
|
case 'aws':
|
||||||
case 'polly':
|
case 'polly':
|
||||||
@@ -409,72 +411,86 @@ const synthPolly = async(createHash, retrieveHash, logger,
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
const synthGoogle = async(logger, {credentials, stats, language, voice, gender, text}) => {
|
|
||||||
|
const synthGoogle = async(logger, {
|
||||||
|
credentials, stats, language, voice, gender, text, model, options, instructions
|
||||||
|
}) => {
|
||||||
const client = new ttsGoogle.TextToSpeechClient(credentials);
|
const client = new ttsGoogle.TextToSpeechClient(credentials);
|
||||||
// If google custom voice cloning is used.
|
|
||||||
// At this time 31 Oct 2024, google node sdk has not support voice cloning yet.
|
|
||||||
if (typeof voice === 'object' && voice.voice_cloning_key) {
|
|
||||||
try {
|
|
||||||
const accessToken = await client.auth.getAccessToken();
|
|
||||||
const projectId = await client.getProjectId();
|
|
||||||
|
|
||||||
const post = bent('https://texttospeech.googleapis.com', 'POST', 'json', {
|
const isGemini = credentials.use_gemini_tts;
|
||||||
'Authorization': `Bearer ${accessToken}`,
|
const isVoiceCloning = typeof voice === 'object' && voice.voice_cloning_key;
|
||||||
'x-goog-user-project': projectId,
|
|
||||||
'Content-Type': 'application/json; charset=utf-8'
|
|
||||||
});
|
|
||||||
|
|
||||||
const payload = {
|
// Build input based on voice type
|
||||||
input: {
|
let input;
|
||||||
text
|
if (isGemini) {
|
||||||
},
|
// Gemini TTS does not support SSML - strip tags if present
|
||||||
voice: {
|
let inputText = text;
|
||||||
language_code: language,
|
if (text.startsWith('<speak>')) {
|
||||||
voice_clone: {
|
inputText = text.replace(/<[^>]*>/g, '').trim();
|
||||||
voice_cloning_key: voice.voice_cloning_key
|
logger.info('synthGoogle: Gemini TTS does not support SSML, stripped tags from input');
|
||||||
}
|
|
||||||
},
|
|
||||||
audioConfig: {
|
|
||||||
// Cloning voice at this time still in v1 beta version, and it support LINEAR16 in Wav format, 24.000Hz
|
|
||||||
audioEncoding: 'LINEAR16',
|
|
||||||
sample_rate_hertz: 24000
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
const wav = await post('/v1beta1/text:synthesize', payload);
|
|
||||||
return {
|
|
||||||
audioContent: Buffer.from(wav.audioContent, 'base64'),
|
|
||||||
extension: 'wav',
|
|
||||||
sampleRate: 24000
|
|
||||||
};
|
|
||||||
} catch (err) {
|
|
||||||
logger.info({err: await err.text()}, 'synthGoogle returned error');
|
|
||||||
throw err;
|
|
||||||
}
|
}
|
||||||
|
// Use instructions as prompt for Gemini TTS style control, options.prompt can override
|
||||||
|
const prompt = options?.prompt || instructions;
|
||||||
|
input = {
|
||||||
|
text: inputText,
|
||||||
|
...(prompt && { prompt })
|
||||||
|
};
|
||||||
|
} else {
|
||||||
|
input = text.startsWith('<speak>') ? { ssml: text } : { text };
|
||||||
}
|
}
|
||||||
|
|
||||||
const opts = {
|
// Build voice selection params based on voice type
|
||||||
voice: {
|
let voiceParams;
|
||||||
...(typeof voice === 'string' && {name: voice}),
|
if (isGemini) {
|
||||||
...(typeof voice === 'object' && {customVoice: voice}),
|
voiceParams = {
|
||||||
|
languageCode: language || 'en-US',
|
||||||
|
name: voice,
|
||||||
|
modelName: model
|
||||||
|
};
|
||||||
|
} else if (isVoiceCloning) {
|
||||||
|
voiceParams = {
|
||||||
|
languageCode: language,
|
||||||
|
voiceClone: {
|
||||||
|
voiceCloningKey: voice.voice_cloning_key
|
||||||
|
}
|
||||||
|
};
|
||||||
|
} else {
|
||||||
|
voiceParams = {
|
||||||
|
...(typeof voice === 'string' && { name: voice }),
|
||||||
|
...(typeof voice === 'object' && { customVoice: voice }),
|
||||||
languageCode: language,
|
languageCode: language,
|
||||||
ssmlGender: gender || 'SSML_VOICE_GENDER_UNSPECIFIED'
|
ssmlGender: gender || 'SSML_VOICE_GENDER_UNSPECIFIED'
|
||||||
},
|
};
|
||||||
audioConfig: {audioEncoding: 'MP3'}
|
}
|
||||||
};
|
|
||||||
Object.assign(opts, {input: text.startsWith('<speak>') ? {ssml: text} : {text}});
|
// Build audio config based on voice type
|
||||||
|
let audioConfig;
|
||||||
|
let extension;
|
||||||
|
let sampleRate;
|
||||||
|
if (isGemini || isVoiceCloning) {
|
||||||
|
audioConfig = { audioEncoding: 'LINEAR16', sampleRateHertz: 24000 };
|
||||||
|
extension = 'r24';
|
||||||
|
sampleRate = 24000;
|
||||||
|
} else {
|
||||||
|
audioConfig = { audioEncoding: 'MP3' };
|
||||||
|
extension = 'mp3';
|
||||||
|
sampleRate = 8000;
|
||||||
|
}
|
||||||
|
|
||||||
|
const opts = { input, voice: voiceParams, audioConfig };
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const responses = await client.synthesizeSpeech(opts);
|
logger.debug({ opts }, 'synthGoogle: request');
|
||||||
|
const [response] = await client.synthesizeSpeech(opts);
|
||||||
stats.increment('tts.count', ['vendor:google', 'accepted:yes']);
|
stats.increment('tts.count', ['vendor:google', 'accepted:yes']);
|
||||||
client.close();
|
client.close();
|
||||||
return {
|
return {
|
||||||
audioContent: responses[0].audioContent,
|
audioContent: response.audioContent,
|
||||||
extension: 'mp3',
|
extension,
|
||||||
sampleRate: 8000
|
sampleRate
|
||||||
};
|
};
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
console.error(err);
|
logger.info({ err, opts }, 'synthAudio: Error synthesizing speech using google');
|
||||||
logger.info({err, opts}, 'synthAudio: Error synthesizing speech using google');
|
|
||||||
stats.increment('tts.count', ['vendor:google', 'accepted:no']);
|
stats.increment('tts.count', ['vendor:google', 'accepted:no']);
|
||||||
client && client.close();
|
client && client.close();
|
||||||
throw err;
|
throw err;
|
||||||
|
|||||||
1376
package-lock.json
generated
1376
package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@@ -30,7 +30,7 @@
|
|||||||
"@aws-sdk/client-polly": "^3.496.0",
|
"@aws-sdk/client-polly": "^3.496.0",
|
||||||
"@aws-sdk/client-sts": "^3.496.0",
|
"@aws-sdk/client-sts": "^3.496.0",
|
||||||
"@cartesia/cartesia-js": "^2.2.7",
|
"@cartesia/cartesia-js": "^2.2.7",
|
||||||
"@google-cloud/text-to-speech": "^5.5.0",
|
"@google-cloud/text-to-speech": "^6.4.0",
|
||||||
"@grpc/grpc-js": "^1.9.14",
|
"@grpc/grpc-js": "^1.9.14",
|
||||||
"@jambonz/realtimedb-helpers": "^0.8.7",
|
"@jambonz/realtimedb-helpers": "^0.8.7",
|
||||||
"bent": "^7.3.12",
|
"bent": "^7.3.12",
|
||||||
|
|||||||
Reference in New Issue
Block a user