mirror of
https://github.com/jambonz/jambonz-api-server.git
synced 2025-12-19 05:47:46 +00:00
support verbio speech (#323)
* support verbio speech * wip * update speech version * update verb specification
This commit is contained in:
2
app.js
2
app.js
@@ -54,6 +54,7 @@ const {
|
||||
getTtsSize,
|
||||
purgeTtsCache,
|
||||
getAwsAuthToken,
|
||||
getVerbioAccessToken,
|
||||
synthAudio
|
||||
} = require('@jambonz/speech-utils')({}, logger);
|
||||
const {
|
||||
@@ -99,6 +100,7 @@ app.locals = {
|
||||
getTtsVoices,
|
||||
getTtsSize,
|
||||
getAwsAuthToken,
|
||||
getVerbioAccessToken,
|
||||
purgeTtsCache,
|
||||
synthAudio,
|
||||
lookupAppBySid,
|
||||
|
||||
@@ -551,7 +551,7 @@
|
||||
</location>
|
||||
<size>
|
||||
<width>293.00</width>
|
||||
<height>560.00</height>
|
||||
<height>540.00</height>
|
||||
</size>
|
||||
<zorder>6</zorder>
|
||||
<SQLField>
|
||||
@@ -3111,11 +3111,11 @@
|
||||
<SourceSidebarWidth><![CDATA[312.000000]]></SourceSidebarWidth>
|
||||
<SQLEditorFileFormatVersion><![CDATA[4]]></SQLEditorFileFormatVersion>
|
||||
<uid><![CDATA[58C99A00-06C9-478C-A667-C63842E088F3]]></uid>
|
||||
<windowHeight><![CDATA[870.000000]]></windowHeight>
|
||||
<windowLocationX><![CDATA[-1164.000000]]></windowLocationX>
|
||||
<windowLocationY><![CDATA[1131.000000]]></windowLocationY>
|
||||
<windowScrollOrigin><![CDATA[{0.5, 0}]]></windowScrollOrigin>
|
||||
<windowWidth><![CDATA[1512.000000]]></windowWidth>
|
||||
<windowHeight><![CDATA[1027.000000]]></windowHeight>
|
||||
<windowLocationX><![CDATA[1728.000000]]></windowLocationX>
|
||||
<windowLocationY><![CDATA[65.000000]]></windowLocationY>
|
||||
<windowScrollOrigin><![CDATA[{1, 0}]]></windowScrollOrigin>
|
||||
<windowWidth><![CDATA[1675.000000]]></windowWidth>
|
||||
</SQLDocumentInfo>
|
||||
<AllowsIndexRenamingOnInsert><![CDATA[1]]></AllowsIndexRenamingOnInsert>
|
||||
<defaultLabelExpanded><![CDATA[1]]></defaultLabelExpanded>
|
||||
|
||||
@@ -8,7 +8,9 @@ const {parseAccountSid, parseServiceProviderSid, parseSpeechCredentialSid} = req
|
||||
const {decryptCredential, testWhisper, testDeepgramTTS,
|
||||
getLanguagesAndVoicesForVendor,
|
||||
testPlayHT,
|
||||
testRimelabs} = require('../../utils/speech-utils');
|
||||
testRimelabs,
|
||||
testVerbioTts,
|
||||
testVerbioStt} = require('../../utils/speech-utils');
|
||||
const {DbErrorUnprocessableRequest, DbErrorForbidden, DbErrorBadRequest} = require('../../utils/errors');
|
||||
const {
|
||||
testGoogleTts,
|
||||
@@ -116,6 +118,7 @@ const encryptCredential = (obj) => {
|
||||
role_arn,
|
||||
region,
|
||||
client_id,
|
||||
client_secret,
|
||||
secret,
|
||||
nuance_tts_uri,
|
||||
nuance_stt_uri,
|
||||
@@ -140,6 +143,7 @@ const encryptCredential = (obj) => {
|
||||
model_id,
|
||||
user_id,
|
||||
voice_engine,
|
||||
engine_version,
|
||||
options
|
||||
} = obj;
|
||||
|
||||
@@ -255,6 +259,13 @@ const encryptCredential = (obj) => {
|
||||
const whisperData = JSON.stringify({api_key, model_id});
|
||||
return encrypt(whisperData);
|
||||
|
||||
case 'verbio':
|
||||
assert(engine_version, 'invalid verbio speech credential: client_id is required');
|
||||
assert(client_id, 'invalid verbio speech credential: client_id is required');
|
||||
assert(client_secret, 'invalid verbio speech credential: secret is required');
|
||||
const verbioData = JSON.stringify({client_id, client_secret, engine_version});
|
||||
return encrypt(verbioData);
|
||||
|
||||
default:
|
||||
if (vendor.startsWith('custom:')) {
|
||||
const customData = JSON.stringify({auth_token, custom_stt_url, custom_tts_url});
|
||||
@@ -501,7 +512,7 @@ router.put('/:sid', async(req, res) => {
|
||||
* Test a credential
|
||||
*/
|
||||
router.get('/:sid/test', async(req, res) => {
|
||||
const {logger, synthAudio} = req.app.locals;
|
||||
const {logger, synthAudio, getVerbioAccessToken} = req.app.locals;
|
||||
try {
|
||||
const sid = parseSpeechCredentialSid(req);
|
||||
const creds = await SpeechCredential.retrieve(sid);
|
||||
@@ -672,8 +683,7 @@ router.get('/:sid/test', async(req, res) => {
|
||||
SpeechCredential.sttTestResult(sid, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (cred.vendor === 'deepgram') {
|
||||
} else if (cred.vendor === 'deepgram') {
|
||||
const {api_key} = credential;
|
||||
if (cred.use_for_tts) {
|
||||
try {
|
||||
@@ -803,6 +813,27 @@ router.get('/:sid/test', async(req, res) => {
|
||||
SpeechCredential.ttsTestResult(sid, false);
|
||||
}
|
||||
}
|
||||
} else if (cred.vendor === 'verbio') {
|
||||
if (cred.use_for_tts) {
|
||||
try {
|
||||
await testVerbioTts(logger, synthAudio, credential);
|
||||
results.tts.status = 'ok';
|
||||
SpeechCredential.ttsTestResult(sid, true);
|
||||
} catch (err) {
|
||||
results.tts = {status: 'fail', reason: err.message};
|
||||
SpeechCredential.ttsTestResult(sid, false);
|
||||
}
|
||||
}
|
||||
if (cred.use_for_stt) {
|
||||
try {
|
||||
await testVerbioStt(logger, getVerbioAccessToken, credential);
|
||||
results.stt.status = 'ok';
|
||||
SpeechCredential.sttTestResult(sid, true);
|
||||
} catch (err) {
|
||||
results.stt = {status: 'fail', reason: err.message};
|
||||
SpeechCredential.sttTestResult(sid, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
res.status(200).json(results);
|
||||
|
||||
14
lib/utils/speech-data/stt-verbio.js
Normal file
14
lib/utils/speech-data/stt-verbio.js
Normal file
@@ -0,0 +1,14 @@
|
||||
module.exports = [
|
||||
{ name: 'US English', value: 'en-US' },
|
||||
{ name: 'British English', value: 'en-GB' },
|
||||
{ name: 'LATAM Spanish', value: 'en-USes-419' },
|
||||
{ name: 'Spanish', value: 'es' },
|
||||
{ name: 'Catalan', value: 'ca-ES', version: 'v2' },
|
||||
{ name: 'Brazilian Portuguese', value: 'pt-BR' },
|
||||
{ name: 'French', value: 'fr', version: 'v1' },
|
||||
{ name: 'Canadian French', value: 'fr-CA', version: 'v1' },
|
||||
{ name: 'German', value: 'de', version: 'v1' },
|
||||
{ name: 'Italian', value: 'it', version: 'v1' },
|
||||
{ name: 'Turkish', value: 'tr', version: 'v1' },
|
||||
{ name: 'Japanese', value: 'ja', version: 'v1' },
|
||||
];
|
||||
62
lib/utils/speech-data/tts-verbio.js
Normal file
62
lib/utils/speech-data/tts-verbio.js
Normal file
@@ -0,0 +1,62 @@
|
||||
module.exports = [
|
||||
{
|
||||
value: 'en-US',
|
||||
name: 'US English',
|
||||
voices: [
|
||||
{
|
||||
value: 'tommy_en_us',
|
||||
name: 'Tommy-Male',
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
value: 'es-ES',
|
||||
name: 'Castilian Spanish',
|
||||
voices: [
|
||||
{
|
||||
value: 'david_es_es',
|
||||
name: 'David-Male',
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
value: 'es-PE',
|
||||
name: 'Peruvian Spanish',
|
||||
voices: [
|
||||
{
|
||||
value: 'miguel_es_pe',
|
||||
name: 'Miguel-Male',
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
value: 'es-PE',
|
||||
name: 'Peruvian Spanish',
|
||||
voices: [
|
||||
{
|
||||
value: 'luz_es_pe',
|
||||
name: 'Luz-Female',
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
value: 'pt-BR',
|
||||
name: 'Brazilian Portuguese',
|
||||
voices: [
|
||||
{
|
||||
value: 'bel_pt_br',
|
||||
name: 'Bel-Female',
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
value: 'ca-ES',
|
||||
name: 'Catalan',
|
||||
voices: [
|
||||
{
|
||||
value: 'anna_ca',
|
||||
name: 'Anna-Female',
|
||||
},
|
||||
],
|
||||
},
|
||||
];
|
||||
@@ -18,6 +18,7 @@ const TtsNvidiaLanguagesVoices = require('./speech-data/tts-nvidia');
|
||||
const TtsElevenlabsLanguagesVoices = require('./speech-data/tts-elevenlabs');
|
||||
const TtsWhisperLanguagesVoices = require('./speech-data/tts-whisper');
|
||||
const TtsPlayHtLanguagesVoices = require('./speech-data/tts-playht');
|
||||
const TtsVerbioLanguagesVoices = require('./speech-data/tts-verbio');
|
||||
|
||||
const TtsModelDeepgram = require('./speech-data/tts-model-deepgram');
|
||||
const TtsModelElevenLabs = require('./speech-data/tts-model-elevenlabs');
|
||||
@@ -35,6 +36,7 @@ const SttNvidiaLanguagesVoices = require('./speech-data/stt-nvidia');
|
||||
const SttCobaltLanguagesVoices = require('./speech-data/stt-cobalt');
|
||||
const SttSonioxLanguagesVoices = require('./speech-data/stt-soniox');
|
||||
const SttAssemblyaiLanguagesVoices = require('./speech-data/stt-assemblyai');
|
||||
const SttVerbioLanguagesVoices = require('./speech-data/stt-verbio');
|
||||
|
||||
const testSonioxStt = async(logger, credentials) => {
|
||||
const api_key = credentials;
|
||||
@@ -366,6 +368,43 @@ const testWellSaidStt = async(logger, credentials) => {
|
||||
return true;
|
||||
};
|
||||
|
||||
const testVerbioTts = async(logger, synthAudio, credentials) => {
|
||||
try {
|
||||
await synthAudio(
|
||||
{
|
||||
increment: () => {},
|
||||
histogram: () => {}
|
||||
},
|
||||
{
|
||||
vendor: 'verbio',
|
||||
credentials,
|
||||
language: 'en-US',
|
||||
voice: 'tommy_en-us',
|
||||
text: 'Hi there and welcome to jambones!'
|
||||
}
|
||||
);
|
||||
} catch (err) {
|
||||
logger.info({err}, 'synth Verbio returned error');
|
||||
throw err;
|
||||
}
|
||||
};
|
||||
const testVerbioStt = async(logger, getVerbioAccessToken, credentials) => {
|
||||
const token = await getVerbioAccessToken(credentials);
|
||||
try {
|
||||
const post = bent('https://us.rest.speechcenter.verbio.com', 'POST', 'json', {
|
||||
'Authorization': `Bearer ${token.access_token}`,
|
||||
'User-Agent': 'jambonz',
|
||||
'Content-Type': 'audio/wav'
|
||||
});
|
||||
const json = await post('/api/v1/recognize?language=en-US&version=V1',
|
||||
fs.readFileSync(`${__dirname}/../../data/test_audio.wav`));
|
||||
logger.debug({json}, 'successfully speech to text from verbio');
|
||||
} catch (err) {
|
||||
logger.info({err}, 'testWellSaidTts returned error');
|
||||
throw err;
|
||||
}
|
||||
};
|
||||
|
||||
const testAssemblyStt = async(logger, credentials) => {
|
||||
const {api_key} = credentials;
|
||||
|
||||
@@ -512,6 +551,11 @@ function decryptCredential(obj, credential, logger, isObscureKey = true) {
|
||||
const o = JSON.parse(decrypt(credential));
|
||||
obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
|
||||
obj.model_id = o.model_id;
|
||||
} else if ('verbio' === obj.vendor) {
|
||||
const o = JSON.parse(decrypt(credential));
|
||||
obj.client_id = o.client_id;
|
||||
obj.client_secret = isObscureKey ? obscureKey(o.client_secret) : o.client_secret;
|
||||
obj.engine_version = o.engine_version;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -568,6 +612,8 @@ async function getLanguagesAndVoicesForVendor(logger, vendor, credential, getTts
|
||||
return await getLanguagesVoicesForAssemblyAI(credential, getTtsVoices, logger);
|
||||
case 'whisper':
|
||||
return await getLanguagesVoicesForWhisper(credential, getTtsVoices, logger);
|
||||
case 'verbio':
|
||||
return await getLanguagesVoicesForVerbio(credential, getTtsVoices, logger);
|
||||
default:
|
||||
logger.info(`invalid vendor ${vendor}, return empty result`);
|
||||
throw new Error(`Invalid vendor ${vendor}`);
|
||||
@@ -816,6 +862,23 @@ async function getLanguagesVoicesForWhisper(credential) {
|
||||
return tranform(TtsWhisperLanguagesVoices, undefined, TtsModelWhisper);
|
||||
}
|
||||
|
||||
async function getLanguagesVoicesForVerbio(credentials, getTtsVoices, logger) {
|
||||
const stt = SttVerbioLanguagesVoices.reduce((acc, v) => {
|
||||
if (!v.version || credentials.engine_version === v.version) {
|
||||
acc.push(v);
|
||||
}
|
||||
return acc;
|
||||
}, []);
|
||||
try {
|
||||
const data = await getTtsVoices({vendor: 'verbio', credentials});
|
||||
const voices = parseVerbioLanguagesVoices(data);
|
||||
return tranform(voices, stt, undefined);
|
||||
} catch (err) {
|
||||
logger.info({err}, 'there is error while fetching verbio speech voices');
|
||||
return tranform(TtsVerbioLanguagesVoices, stt, undefined);
|
||||
}
|
||||
}
|
||||
|
||||
function tranform(tts, stt, models) {
|
||||
return {
|
||||
...(tts && {tts}),
|
||||
@@ -943,6 +1006,29 @@ function parseMicrosoftLanguagesVoices(data) {
|
||||
}, []);
|
||||
}
|
||||
|
||||
function parseVerbioLanguagesVoices(data) {
|
||||
return data.reduce((acc, voice) => {
|
||||
const languageCode = voice.language;
|
||||
const existingLanguage = acc.find((lang) => lang.value === languageCode);
|
||||
if (existingLanguage) {
|
||||
existingLanguage.voices.push({
|
||||
value: voice.voice_id,
|
||||
name: voice.name,
|
||||
});
|
||||
} else {
|
||||
acc.push({
|
||||
value: voice.language,
|
||||
name: voice.language,
|
||||
voices: [{
|
||||
value: voice.voice_id,
|
||||
name: voice.name,
|
||||
}]
|
||||
});
|
||||
}
|
||||
return acc;
|
||||
}, []);
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
testGoogleTts,
|
||||
testGoogleStt,
|
||||
@@ -966,5 +1052,7 @@ module.exports = {
|
||||
getSpeechCredential,
|
||||
decryptCredential,
|
||||
testWhisper,
|
||||
testVerbioTts,
|
||||
testVerbioStt,
|
||||
getLanguagesAndVoicesForVendor
|
||||
};
|
||||
|
||||
8
package-lock.json
generated
8
package-lock.json
generated
@@ -21,7 +21,7 @@
|
||||
"@jambonz/realtimedb-helpers": "^0.8.9",
|
||||
"@jambonz/speech-utils": "^0.1.3",
|
||||
"@jambonz/time-series": "^0.2.8",
|
||||
"@jambonz/verb-specifications": "^0.0.69",
|
||||
"@jambonz/verb-specifications": "^0.0.72",
|
||||
"@soniox/soniox-node": "^1.2.2",
|
||||
"argon2": "^0.40.1",
|
||||
"assemblyai": "^4.3.4",
|
||||
@@ -2082,9 +2082,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@jambonz/verb-specifications": {
|
||||
"version": "0.0.69",
|
||||
"resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.69.tgz",
|
||||
"integrity": "sha512-DWnz7XRkCzpzyCVJH7NtScv+wSlUC414/EO8j/gPZs3RT4WBW1OBXwXpfjURHcSrDG7lycz+tfA+2WoUdW/W+g==",
|
||||
"version": "0.0.72",
|
||||
"resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.72.tgz",
|
||||
"integrity": "sha512-sjA+/LQP2p1zE02UByy9OaAaSxbfQNxQ6D0pwYoMG42U8n+8Det+GFM/9+oFVnbNjUH9bvgT8vrR57U0lU4Cpw==",
|
||||
"dependencies": {
|
||||
"debug": "^4.3.4",
|
||||
"pino": "^8.8.0"
|
||||
|
||||
@@ -31,7 +31,7 @@
|
||||
"@jambonz/realtimedb-helpers": "^0.8.9",
|
||||
"@jambonz/speech-utils": "^0.1.3",
|
||||
"@jambonz/time-series": "^0.2.8",
|
||||
"@jambonz/verb-specifications": "^0.0.69",
|
||||
"@jambonz/verb-specifications": "^0.0.72",
|
||||
"@soniox/soniox-node": "^1.2.2",
|
||||
"argon2": "^0.40.1",
|
||||
"assemblyai": "^4.3.4",
|
||||
|
||||
Reference in New Issue
Block a user