support verbio speech (#323)

* support verbio speech

* wip

* update speech version

* update verb specification
This commit is contained in:
Hoan Luu Huu
2024-05-29 18:35:40 +07:00
committed by GitHub
parent ffe9cb23eb
commit d33d0aa519
8 changed files with 212 additions and 15 deletions

2
app.js
View File

@@ -54,6 +54,7 @@ const {
getTtsSize,
purgeTtsCache,
getAwsAuthToken,
getVerbioAccessToken,
synthAudio
} = require('@jambonz/speech-utils')({}, logger);
const {
@@ -99,6 +100,7 @@ app.locals = {
getTtsVoices,
getTtsSize,
getAwsAuthToken,
getVerbioAccessToken,
purgeTtsCache,
synthAudio,
lookupAppBySid,

View File

@@ -551,7 +551,7 @@
</location>
<size>
<width>293.00</width>
<height>560.00</height>
<height>540.00</height>
</size>
<zorder>6</zorder>
<SQLField>
@@ -3111,11 +3111,11 @@
<SourceSidebarWidth><![CDATA[312.000000]]></SourceSidebarWidth>
<SQLEditorFileFormatVersion><![CDATA[4]]></SQLEditorFileFormatVersion>
<uid><![CDATA[58C99A00-06C9-478C-A667-C63842E088F3]]></uid>
<windowHeight><![CDATA[870.000000]]></windowHeight>
<windowLocationX><![CDATA[-1164.000000]]></windowLocationX>
<windowLocationY><![CDATA[1131.000000]]></windowLocationY>
<windowScrollOrigin><![CDATA[{0.5, 0}]]></windowScrollOrigin>
<windowWidth><![CDATA[1512.000000]]></windowWidth>
<windowHeight><![CDATA[1027.000000]]></windowHeight>
<windowLocationX><![CDATA[1728.000000]]></windowLocationX>
<windowLocationY><![CDATA[65.000000]]></windowLocationY>
<windowScrollOrigin><![CDATA[{1, 0}]]></windowScrollOrigin>
<windowWidth><![CDATA[1675.000000]]></windowWidth>
</SQLDocumentInfo>
<AllowsIndexRenamingOnInsert><![CDATA[1]]></AllowsIndexRenamingOnInsert>
<defaultLabelExpanded><![CDATA[1]]></defaultLabelExpanded>

View File

@@ -8,7 +8,9 @@ const {parseAccountSid, parseServiceProviderSid, parseSpeechCredentialSid} = req
const {decryptCredential, testWhisper, testDeepgramTTS,
getLanguagesAndVoicesForVendor,
testPlayHT,
testRimelabs} = require('../../utils/speech-utils');
testRimelabs,
testVerbioTts,
testVerbioStt} = require('../../utils/speech-utils');
const {DbErrorUnprocessableRequest, DbErrorForbidden, DbErrorBadRequest} = require('../../utils/errors');
const {
testGoogleTts,
@@ -116,6 +118,7 @@ const encryptCredential = (obj) => {
role_arn,
region,
client_id,
client_secret,
secret,
nuance_tts_uri,
nuance_stt_uri,
@@ -140,6 +143,7 @@ const encryptCredential = (obj) => {
model_id,
user_id,
voice_engine,
engine_version,
options
} = obj;
@@ -255,6 +259,13 @@ const encryptCredential = (obj) => {
const whisperData = JSON.stringify({api_key, model_id});
return encrypt(whisperData);
case 'verbio':
assert(engine_version, 'invalid verbio speech credential: client_id is required');
assert(client_id, 'invalid verbio speech credential: client_id is required');
assert(client_secret, 'invalid verbio speech credential: secret is required');
const verbioData = JSON.stringify({client_id, client_secret, engine_version});
return encrypt(verbioData);
default:
if (vendor.startsWith('custom:')) {
const customData = JSON.stringify({auth_token, custom_stt_url, custom_tts_url});
@@ -501,7 +512,7 @@ router.put('/:sid', async(req, res) => {
* Test a credential
*/
router.get('/:sid/test', async(req, res) => {
const {logger, synthAudio} = req.app.locals;
const {logger, synthAudio, getVerbioAccessToken} = req.app.locals;
try {
const sid = parseSpeechCredentialSid(req);
const creds = await SpeechCredential.retrieve(sid);
@@ -672,8 +683,7 @@ router.get('/:sid/test', async(req, res) => {
SpeechCredential.sttTestResult(sid, false);
}
}
}
else if (cred.vendor === 'deepgram') {
} else if (cred.vendor === 'deepgram') {
const {api_key} = credential;
if (cred.use_for_tts) {
try {
@@ -803,6 +813,27 @@ router.get('/:sid/test', async(req, res) => {
SpeechCredential.ttsTestResult(sid, false);
}
}
} else if (cred.vendor === 'verbio') {
if (cred.use_for_tts) {
try {
await testVerbioTts(logger, synthAudio, credential);
results.tts.status = 'ok';
SpeechCredential.ttsTestResult(sid, true);
} catch (err) {
results.tts = {status: 'fail', reason: err.message};
SpeechCredential.ttsTestResult(sid, false);
}
}
if (cred.use_for_stt) {
try {
await testVerbioStt(logger, getVerbioAccessToken, credential);
results.stt.status = 'ok';
SpeechCredential.sttTestResult(sid, true);
} catch (err) {
results.stt = {status: 'fail', reason: err.message};
SpeechCredential.sttTestResult(sid, false);
}
}
}
res.status(200).json(results);

View File

@@ -0,0 +1,14 @@
module.exports = [
{ name: 'US English', value: 'en-US' },
{ name: 'British English', value: 'en-GB' },
{ name: 'LATAM Spanish', value: 'en-USes-419' },
{ name: 'Spanish', value: 'es' },
{ name: 'Catalan', value: 'ca-ES', version: 'v2' },
{ name: 'Brazilian Portuguese', value: 'pt-BR' },
{ name: 'French', value: 'fr', version: 'v1' },
{ name: 'Canadian French', value: 'fr-CA', version: 'v1' },
{ name: 'German', value: 'de', version: 'v1' },
{ name: 'Italian', value: 'it', version: 'v1' },
{ name: 'Turkish', value: 'tr', version: 'v1' },
{ name: 'Japanese', value: 'ja', version: 'v1' },
];

View File

@@ -0,0 +1,62 @@
module.exports = [
{
value: 'en-US',
name: 'US English',
voices: [
{
value: 'tommy_en_us',
name: 'Tommy-Male',
},
],
},
{
value: 'es-ES',
name: 'Castilian Spanish',
voices: [
{
value: 'david_es_es',
name: 'David-Male',
},
],
},
{
value: 'es-PE',
name: 'Peruvian Spanish',
voices: [
{
value: 'miguel_es_pe',
name: 'Miguel-Male',
},
],
},
{
value: 'es-PE',
name: 'Peruvian Spanish',
voices: [
{
value: 'luz_es_pe',
name: 'Luz-Female',
},
],
},
{
value: 'pt-BR',
name: 'Brazilian Portuguese',
voices: [
{
value: 'bel_pt_br',
name: 'Bel-Female',
},
],
},
{
value: 'ca-ES',
name: 'Catalan',
voices: [
{
value: 'anna_ca',
name: 'Anna-Female',
},
],
},
];

View File

@@ -18,6 +18,7 @@ const TtsNvidiaLanguagesVoices = require('./speech-data/tts-nvidia');
const TtsElevenlabsLanguagesVoices = require('./speech-data/tts-elevenlabs');
const TtsWhisperLanguagesVoices = require('./speech-data/tts-whisper');
const TtsPlayHtLanguagesVoices = require('./speech-data/tts-playht');
const TtsVerbioLanguagesVoices = require('./speech-data/tts-verbio');
const TtsModelDeepgram = require('./speech-data/tts-model-deepgram');
const TtsModelElevenLabs = require('./speech-data/tts-model-elevenlabs');
@@ -35,6 +36,7 @@ const SttNvidiaLanguagesVoices = require('./speech-data/stt-nvidia');
const SttCobaltLanguagesVoices = require('./speech-data/stt-cobalt');
const SttSonioxLanguagesVoices = require('./speech-data/stt-soniox');
const SttAssemblyaiLanguagesVoices = require('./speech-data/stt-assemblyai');
const SttVerbioLanguagesVoices = require('./speech-data/stt-verbio');
const testSonioxStt = async(logger, credentials) => {
const api_key = credentials;
@@ -366,6 +368,43 @@ const testWellSaidStt = async(logger, credentials) => {
return true;
};
const testVerbioTts = async(logger, synthAudio, credentials) => {
try {
await synthAudio(
{
increment: () => {},
histogram: () => {}
},
{
vendor: 'verbio',
credentials,
language: 'en-US',
voice: 'tommy_en-us',
text: 'Hi there and welcome to jambones!'
}
);
} catch (err) {
logger.info({err}, 'synth Verbio returned error');
throw err;
}
};
const testVerbioStt = async(logger, getVerbioAccessToken, credentials) => {
const token = await getVerbioAccessToken(credentials);
try {
const post = bent('https://us.rest.speechcenter.verbio.com', 'POST', 'json', {
'Authorization': `Bearer ${token.access_token}`,
'User-Agent': 'jambonz',
'Content-Type': 'audio/wav'
});
const json = await post('/api/v1/recognize?language=en-US&version=V1',
fs.readFileSync(`${__dirname}/../../data/test_audio.wav`));
logger.debug({json}, 'successfully speech to text from verbio');
} catch (err) {
logger.info({err}, 'testWellSaidTts returned error');
throw err;
}
};
const testAssemblyStt = async(logger, credentials) => {
const {api_key} = credentials;
@@ -512,6 +551,11 @@ function decryptCredential(obj, credential, logger, isObscureKey = true) {
const o = JSON.parse(decrypt(credential));
obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
obj.model_id = o.model_id;
} else if ('verbio' === obj.vendor) {
const o = JSON.parse(decrypt(credential));
obj.client_id = o.client_id;
obj.client_secret = isObscureKey ? obscureKey(o.client_secret) : o.client_secret;
obj.engine_version = o.engine_version;
}
}
@@ -568,6 +612,8 @@ async function getLanguagesAndVoicesForVendor(logger, vendor, credential, getTts
return await getLanguagesVoicesForAssemblyAI(credential, getTtsVoices, logger);
case 'whisper':
return await getLanguagesVoicesForWhisper(credential, getTtsVoices, logger);
case 'verbio':
return await getLanguagesVoicesForVerbio(credential, getTtsVoices, logger);
default:
logger.info(`invalid vendor ${vendor}, return empty result`);
throw new Error(`Invalid vendor ${vendor}`);
@@ -816,6 +862,23 @@ async function getLanguagesVoicesForWhisper(credential) {
return tranform(TtsWhisperLanguagesVoices, undefined, TtsModelWhisper);
}
async function getLanguagesVoicesForVerbio(credentials, getTtsVoices, logger) {
const stt = SttVerbioLanguagesVoices.reduce((acc, v) => {
if (!v.version || credentials.engine_version === v.version) {
acc.push(v);
}
return acc;
}, []);
try {
const data = await getTtsVoices({vendor: 'verbio', credentials});
const voices = parseVerbioLanguagesVoices(data);
return tranform(voices, stt, undefined);
} catch (err) {
logger.info({err}, 'there is error while fetching verbio speech voices');
return tranform(TtsVerbioLanguagesVoices, stt, undefined);
}
}
function tranform(tts, stt, models) {
return {
...(tts && {tts}),
@@ -943,6 +1006,29 @@ function parseMicrosoftLanguagesVoices(data) {
}, []);
}
function parseVerbioLanguagesVoices(data) {
return data.reduce((acc, voice) => {
const languageCode = voice.language;
const existingLanguage = acc.find((lang) => lang.value === languageCode);
if (existingLanguage) {
existingLanguage.voices.push({
value: voice.voice_id,
name: voice.name,
});
} else {
acc.push({
value: voice.language,
name: voice.language,
voices: [{
value: voice.voice_id,
name: voice.name,
}]
});
}
return acc;
}, []);
}
module.exports = {
testGoogleTts,
testGoogleStt,
@@ -966,5 +1052,7 @@ module.exports = {
getSpeechCredential,
decryptCredential,
testWhisper,
testVerbioTts,
testVerbioStt,
getLanguagesAndVoicesForVendor
};

8
package-lock.json generated
View File

@@ -21,7 +21,7 @@
"@jambonz/realtimedb-helpers": "^0.8.9",
"@jambonz/speech-utils": "^0.1.3",
"@jambonz/time-series": "^0.2.8",
"@jambonz/verb-specifications": "^0.0.69",
"@jambonz/verb-specifications": "^0.0.72",
"@soniox/soniox-node": "^1.2.2",
"argon2": "^0.40.1",
"assemblyai": "^4.3.4",
@@ -2082,9 +2082,9 @@
}
},
"node_modules/@jambonz/verb-specifications": {
"version": "0.0.69",
"resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.69.tgz",
"integrity": "sha512-DWnz7XRkCzpzyCVJH7NtScv+wSlUC414/EO8j/gPZs3RT4WBW1OBXwXpfjURHcSrDG7lycz+tfA+2WoUdW/W+g==",
"version": "0.0.72",
"resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.72.tgz",
"integrity": "sha512-sjA+/LQP2p1zE02UByy9OaAaSxbfQNxQ6D0pwYoMG42U8n+8Det+GFM/9+oFVnbNjUH9bvgT8vrR57U0lU4Cpw==",
"dependencies": {
"debug": "^4.3.4",
"pino": "^8.8.0"

View File

@@ -31,7 +31,7 @@
"@jambonz/realtimedb-helpers": "^0.8.9",
"@jambonz/speech-utils": "^0.1.3",
"@jambonz/time-series": "^0.2.8",
"@jambonz/verb-specifications": "^0.0.69",
"@jambonz/verb-specifications": "^0.0.72",
"@soniox/soniox-node": "^1.2.2",
"argon2": "^0.40.1",
"assemblyai": "^4.3.4",