mirror of
https://github.com/jambonz/speech-utils.git
synced 2025-12-19 03:37:49 +00:00
support google voice cloning
This commit is contained in:
@@ -170,7 +170,7 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
|
||||
renderForCaching
|
||||
});
|
||||
let filePath;
|
||||
filePath = makeFilePath({vendor, key, salt, renderForCaching});
|
||||
filePath = makeFilePath({vendor, voice, key, salt, renderForCaching});
|
||||
debug(`synth key is ${key}`);
|
||||
let cached;
|
||||
if (!disableTtsCache) {
|
||||
@@ -192,7 +192,7 @@ async function synthAudio(client, createHash, retrieveHash, logger, stats, { acc
|
||||
cached = await client.get(preCachekey);
|
||||
if (cached) {
|
||||
// Precached audio is available; update filePath to use the precache file extension.
|
||||
filePath = makeFilePath({vendor, key, salt, renderForCaching: true});
|
||||
filePath = makeFilePath({vendor, voice, key, salt, renderForCaching: true});
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -353,6 +353,44 @@ const synthPolly = async(createHash, retrieveHash, logger,
|
||||
|
||||
const synthGoogle = async(logger, {credentials, stats, language, voice, gender, text}) => {
|
||||
const client = new ttsGoogle.TextToSpeechClient(credentials);
|
||||
// If google custom voice cloning is used.
|
||||
// As of 31 Oct 2024, the Google Node.js SDK does not yet support voice cloning.
|
||||
if (typeof voice === 'object' && voice.voice_cloning_key) {
|
||||
try {
|
||||
const accessToken = await client.auth.getAccessToken();
|
||||
const projectId = await client.getProjectId();
|
||||
|
||||
const post = bent('https://texttospeech.googleapis.com', 'POST', 'json', {
|
||||
'Authorization': `Bearer ${accessToken}`,
|
||||
'x-goog-user-project': projectId,
|
||||
'Content-Type': 'application/json; charset=utf-8'
|
||||
});
|
||||
|
||||
const payload = {
|
||||
input: {
|
||||
text
|
||||
},
|
||||
voice: {
|
||||
language_code: language,
|
||||
voice_clone: {
|
||||
voice_cloning_key: voice.voice_cloning_key
|
||||
}
|
||||
},
|
||||
audioConfig: {
|
||||
// Voice cloning is currently still in the v1beta1 API, and it supports LINEAR16 in WAV format at 24,000 Hz
|
||||
audioEncoding: 'LINEAR16',
|
||||
sample_rate_hertz: 24000
|
||||
}
|
||||
};
|
||||
|
||||
const mp3 = await post('/v1beta1/text:synthesize', payload);
|
||||
return Buffer.from(mp3.audioContent, 'base64');
|
||||
} catch (err) {
|
||||
logger.info({err: await err.text()}, 'synthGoogle returned error');
|
||||
throw err;
|
||||
}
|
||||
}
|
||||
|
||||
const opts = {
|
||||
voice: {
|
||||
...(typeof voice === 'string' && {name: voice}),
|
||||
|
||||
16
lib/utils.js
16
lib/utils.js
@@ -23,19 +23,20 @@ function makeSynthKey({
|
||||
hash.update(`${language}:${vendor}:${voice}:${engine}:${text}`);
|
||||
const hexHashKey = hash.digest('hex');
|
||||
const accountKey = account_sid ? `:${account_sid}` : '';
|
||||
const namespace = vendor.startsWith('custom') ? vendor : getFileExtension({vendor, renderForCaching});
|
||||
const namespace = vendor.startsWith('custom') ? vendor : getFileExtension({vendor, voice, renderForCaching});
|
||||
const key = `tts${accountKey}:${namespace}:${hexHashKey}`;
|
||||
return key;
|
||||
}
|
||||
|
||||
function makeFilePath({vendor, key, salt = '', renderForCaching = false}) {
|
||||
const extension = getFileExtension({vendor, renderForCaching});
|
||||
function makeFilePath({vendor, voice, key, salt = '', renderForCaching = false}) {
|
||||
const extension = getFileExtension({vendor, renderForCaching, voice});
|
||||
return `${TMP_FOLDER}/${key.replace('tts:', `tts-${salt}`)}.${extension}`;
|
||||
}
|
||||
|
||||
function getFileExtension({vendor, renderForCaching = false}) {
|
||||
function getFileExtension({vendor, voice, renderForCaching = false}) {
|
||||
const mp3Extension = 'mp3';
|
||||
const r8Extension = 'r8';
|
||||
const wavExtension = 'wav';
|
||||
|
||||
switch (vendor) {
|
||||
case 'azure':
|
||||
@@ -58,6 +59,13 @@ function getFileExtension({vendor, renderForCaching = false}) {
|
||||
case 'nvidia':
|
||||
case 'verbio':
|
||||
return r8Extension;
|
||||
case 'google':
|
||||
// Google voice cloning only supports WAV output.
|
||||
if (typeof voice === 'object' && voice.voice_cloning_key) {
|
||||
return wavExtension;
|
||||
} else {
|
||||
return mp3Extension;
|
||||
}
|
||||
default:
|
||||
// If vendor is custom
|
||||
if (vendor.startsWith('custom')) {
|
||||
|
||||
15
package-lock.json
generated
15
package-lock.json
generated
@@ -11,7 +11,7 @@
|
||||
"dependencies": {
|
||||
"@aws-sdk/client-polly": "^3.496.0",
|
||||
"@aws-sdk/client-sts": "^3.496.0",
|
||||
"@google-cloud/text-to-speech": "^5.0.2",
|
||||
"@google-cloud/text-to-speech": "^5.5.0",
|
||||
"@grpc/grpc-js": "^1.9.14",
|
||||
"@jambonz/realtimedb-helpers": "^0.8.7",
|
||||
"bent": "^7.3.12",
|
||||
@@ -1230,9 +1230,10 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@google-cloud/text-to-speech": {
|
||||
"version": "5.0.2",
|
||||
"resolved": "https://registry.npmjs.org/@google-cloud/text-to-speech/-/text-to-speech-5.0.2.tgz",
|
||||
"integrity": "sha512-Q11Ddh9eHKSDA3E/KSqMITgVprXb0XgIKuJP9F5ScJ1T9h+DNrbgIU7shd0QOlPqb8ruQRiTOqL08+Mq5R89Ow==",
|
||||
"version": "5.5.0",
|
||||
"resolved": "https://registry.npmjs.org/@google-cloud/text-to-speech/-/text-to-speech-5.5.0.tgz",
|
||||
"integrity": "sha512-Cw/UK2Y3l31Vsuozu8cxsmVS/09fShimes0tRLgDbOY2ZMG1Dckb6Zf/Q3Nxg4X0feFep44pvwNmyHKrOnl9SQ==",
|
||||
"license": "Apache-2.0",
|
||||
"dependencies": {
|
||||
"google-gax": "^4.0.3"
|
||||
},
|
||||
@@ -8338,9 +8339,9 @@
|
||||
"dev": true
|
||||
},
|
||||
"@google-cloud/text-to-speech": {
|
||||
"version": "5.0.2",
|
||||
"resolved": "https://registry.npmjs.org/@google-cloud/text-to-speech/-/text-to-speech-5.0.2.tgz",
|
||||
"integrity": "sha512-Q11Ddh9eHKSDA3E/KSqMITgVprXb0XgIKuJP9F5ScJ1T9h+DNrbgIU7shd0QOlPqb8ruQRiTOqL08+Mq5R89Ow==",
|
||||
"version": "5.5.0",
|
||||
"resolved": "https://registry.npmjs.org/@google-cloud/text-to-speech/-/text-to-speech-5.5.0.tgz",
|
||||
"integrity": "sha512-Cw/UK2Y3l31Vsuozu8cxsmVS/09fShimes0tRLgDbOY2ZMG1Dckb6Zf/Q3Nxg4X0feFep44pvwNmyHKrOnl9SQ==",
|
||||
"requires": {
|
||||
"google-gax": "^4.0.3"
|
||||
}
|
||||
|
||||
@@ -28,7 +28,7 @@
|
||||
"dependencies": {
|
||||
"@aws-sdk/client-polly": "^3.496.0",
|
||||
"@aws-sdk/client-sts": "^3.496.0",
|
||||
"@google-cloud/text-to-speech": "^5.0.2",
|
||||
"@google-cloud/text-to-speech": "^5.5.0",
|
||||
"@grpc/grpc-js": "^1.9.14",
|
||||
"@jambonz/realtimedb-helpers": "^0.8.7",
|
||||
"bent": "^7.3.12",
|
||||
|
||||
@@ -91,14 +91,17 @@ test('Google speech Custom voice synth tests', async(t) => {
|
||||
const fn = require('..');
|
||||
const {synthAudio, client} = fn(opts, logger);
|
||||
|
||||
if (!process.env.GCP_CUSTOM_VOICE_FILE && !process.env.GCP_CUSTOM_VOICE_JSON_KEY || !process.env.GCP_CUSTOM_VOICE_MODEL) {
|
||||
t.pass('skipping google speech synth tests since neither GCP_CUSTOM_VOICE_FILE nor GCP_CUSTOM_VOICE_JSON_KEY provided, GCP_CUSTOM_VOICE_MODEL is not provided');
|
||||
if (!process.env.GCP_CUSTOM_VOICE_FILE &&
|
||||
!process.env.GCP_CUSTOM_VOICE_JSON_KEY ||
|
||||
!process.env.GCP_CUSTOM_VOICE_MODEL) {
|
||||
t.pass(`skipping google speech synth tests since neither
|
||||
GCP_CUSTOM_VOICE_FILE nor GCP_CUSTOM_VOICE_JSON_KEY provided, GCP_CUSTOM_VOICE_MODEL is not provided`);
|
||||
return t.end();
|
||||
}
|
||||
try {
|
||||
const str = process.env.GCP_CUSTOM_VOICE_JSON_KEY || fs.readFileSync(process.env.GCP_CUSTOM_VOICE_FILE);
|
||||
const creds = JSON.parse(str);
|
||||
let opts = await synthAudio(stats, {
|
||||
const opts = await synthAudio(stats, {
|
||||
vendor: 'google',
|
||||
credentials: {
|
||||
credentials: {
|
||||
@@ -109,7 +112,7 @@ test('Google speech Custom voice synth tests', async(t) => {
|
||||
language: 'en-AU',
|
||||
text: 'This is a test. This is only a test',
|
||||
voice: {
|
||||
reportedUsage:"REALTIME",
|
||||
reportedUsage: 'REALTIME',
|
||||
model: process.env.GCP_CUSTOM_VOICE_MODEL
|
||||
}
|
||||
});
|
||||
@@ -121,6 +124,48 @@ test('Google speech Custom voice synth tests', async(t) => {
|
||||
client.quit();
|
||||
});
|
||||
|
||||
test('Google speech voice cloning synth tests', async(t) => {
|
||||
const fn = require('..');
|
||||
const {synthAudio, client} = fn(opts, logger);
|
||||
|
||||
if (!process.env.GCP_CUSTOM_VOICE_FILE &&
|
||||
!process.env.GCP_CUSTOM_VOICE_JSON_KEY ||
|
||||
!process.env.GCP_VOICE_CLONING_FILE &&
|
||||
!process.env.GCP_VOICE_CLONING_JSON_KEY) {
|
||||
t.pass(`skipping google speech synth tests since neither
|
||||
GCP_CUSTOM_VOICE_FILE nor GCP_CUSTOM_VOICE_JSON_KEY provided,
|
||||
GCP_VOICE_CLONING_FILE nor GCP_VOICE_CLONING_JSON_KEY is not provided`);
|
||||
return t.end();
|
||||
}
|
||||
try {
|
||||
const googleKey = process.env.GCP_CUSTOM_VOICE_JSON_KEY ||
|
||||
fs.readFileSync(process.env.GCP_CUSTOM_VOICE_FILE);
|
||||
const voice_cloning_key = process.env.GCP_VOICE_CLONING_JSON_KEY ||
|
||||
fs.readFileSync(process.env.GCP_VOICE_CLONING_FILE).toString();
|
||||
const creds = JSON.parse(googleKey);
|
||||
const opts = await synthAudio(stats, {
|
||||
vendor: 'google',
|
||||
credentials: {
|
||||
credentials: {
|
||||
client_email: creds.client_email,
|
||||
private_key: creds.private_key,
|
||||
project_id: creds.project_id
|
||||
},
|
||||
},
|
||||
language: 'en-US',
|
||||
text: 'This is a test. This is only a test. This is a test. This is only a test. This is a test. This is only a test',
|
||||
voice: {
|
||||
voice_cloning_key
|
||||
}
|
||||
});
|
||||
t.ok(!opts.servedFromCache, `successfully synthesized google voice cloning audio to ${opts.filePath}`);
|
||||
} catch (err) {
|
||||
console.error(err);
|
||||
t.end(err);
|
||||
}
|
||||
client.quit();
|
||||
});
|
||||
|
||||
test('AWS speech synth tests', async(t) => {
|
||||
const fn = require('..');
|
||||
const {synthAudio, client} = fn(opts, logger);
|
||||
|
||||
Reference in New Issue
Block a user