mirror of
https://github.com/jambonz/jambonz-api-server.git
synced 2025-12-18 21:37:43 +00:00
support gladia stt (#503)
* support gladia stt * wip * update verb specification
This commit is contained in:
@@ -17,7 +17,8 @@ const {decryptCredential, testWhisper, testDeepgramTTS,
|
||||
testOpenAiStt,
|
||||
testInworld,
|
||||
testResembleTTS,
|
||||
testHoundifyStt} = require('../../utils/speech-utils');
|
||||
testHoundifyStt,
|
||||
testGladiaStt} = require('../../utils/speech-utils');
|
||||
const {DbErrorUnprocessableRequest, DbErrorForbidden, DbErrorBadRequest} = require('../../utils/errors');
|
||||
const {
|
||||
testGoogleTts,
|
||||
@@ -232,6 +233,10 @@ const encryptCredential = (obj) => {
|
||||
deepgram_stt_use_tls, deepgram_tts_uri, model_id});
|
||||
return encrypt(deepgramData);
|
||||
|
||||
case 'gladia':
|
||||
const gladiaData = JSON.stringify({api_key, region});
|
||||
return encrypt(gladiaData);
|
||||
|
||||
case 'resemble':
|
||||
assert(api_key, 'invalid resemble speech credential: api_key is required');
|
||||
const resembleData = JSON.stringify({
|
||||
@@ -835,6 +840,18 @@ router.get('/:sid/test', async(req, res) => {
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (cred.vendor === 'gladia') {
|
||||
if (cred.use_for_stt) {
|
||||
try {
|
||||
await testGladiaStt(logger, credential);
|
||||
results.stt.status = 'ok';
|
||||
SpeechCredential.sttTestResult(sid, true);
|
||||
} catch (err) {
|
||||
results.stt = {status: 'fail', reason: err.message};
|
||||
SpeechCredential.sttTestResult(sid, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (cred.vendor === 'ibm') {
|
||||
const {getTtsVoices} = req.app.locals;
|
||||
|
||||
|
||||
103
lib/utils/speech-data/stt-gladia.js
Normal file
103
lib/utils/speech-data/stt-gladia.js
Normal file
@@ -0,0 +1,103 @@
|
||||
module.exports = [
|
||||
{ name: 'Afrikaans', value: 'af' },
|
||||
{ name: 'Albanian', value: 'sq' },
|
||||
{ name: 'Amharic', value: 'am' },
|
||||
{ name: 'Arabic', value: 'ar' },
|
||||
{ name: 'Armenian', value: 'hy' },
|
||||
{ name: 'Assamese', value: 'as' },
|
||||
{ name: 'Azerbaijani', value: 'az' },
|
||||
{ name: 'Bashkir', value: 'ba' },
|
||||
{ name: 'Basque', value: 'eu' },
|
||||
{ name: 'Belarusian', value: 'be' },
|
||||
{ name: 'Bengali', value: 'bn' },
|
||||
{ name: 'Bosnian', value: 'bs' },
|
||||
{ name: 'Breton', value: 'br' },
|
||||
{ name: 'Bulgarian', value: 'bg' },
|
||||
{ name: 'Cantonese', value: 'yue' },
|
||||
{ name: 'Catalan', value: 'ca' },
|
||||
{ name: 'Chinese', value: 'zh' },
|
||||
{ name: 'Croatian', value: 'hr' },
|
||||
{ name: 'Czech', value: 'cs' },
|
||||
{ name: 'Danish', value: 'da' },
|
||||
{ name: 'Dutch', value: 'nl' },
|
||||
{ name: 'English', value: 'en' },
|
||||
{ name: 'Estonian', value: 'et' },
|
||||
{ name: 'Faroese', value: 'fo' },
|
||||
{ name: 'Finnish', value: 'fi' },
|
||||
{ name: 'French', value: 'fr' },
|
||||
{ name: 'Galician', value: 'gl' },
|
||||
{ name: 'Georgian', value: 'ka' },
|
||||
{ name: 'German', value: 'de' },
|
||||
{ name: 'Greek', value: 'el' },
|
||||
{ name: 'Gujarati', value: 'gu' },
|
||||
{ name: 'Haitian Creole', value: 'ht' },
|
||||
{ name: 'Hausa', value: 'ha' },
|
||||
{ name: 'Hawaiian', value: 'haw' },
|
||||
{ name: 'Hebrew', value: 'he' },
|
||||
{ name: 'Hindi', value: 'hi' },
|
||||
{ name: 'Hungarian', value: 'hu' },
|
||||
{ name: 'Icelandic', value: 'is' },
|
||||
{ name: 'Indonesian', value: 'id' },
|
||||
{ name: 'Italian', value: 'it' },
|
||||
{ name: 'Japanese', value: 'ja' },
|
||||
{ name: 'Javanese', value: 'jw' },
|
||||
{ name: 'Kannada', value: 'kn' },
|
||||
{ name: 'Kazakh', value: 'kk' },
|
||||
{ name: 'Khmer', value: 'km' },
|
||||
{ name: 'Korean', value: 'ko' },
|
||||
{ name: 'Lao', value: 'lo' },
|
||||
{ name: 'Latin', value: 'la' },
|
||||
{ name: 'Latvian', value: 'lv' },
|
||||
{ name: 'Lingala', value: 'ln' },
|
||||
{ name: 'Lithuanian', value: 'lt' },
|
||||
{ name: 'Luxembourgish', value: 'lb' },
|
||||
{ name: 'Macedonian', value: 'mk' },
|
||||
{ name: 'Malagasy', value: 'mg' },
|
||||
{ name: 'Malay', value: 'ms' },
|
||||
{ name: 'Malayalam', value: 'ml' },
|
||||
{ name: 'Maltese', value: 'mt' },
|
||||
{ name: 'Maori', value: 'mi' },
|
||||
{ name: 'Marathi', value: 'mr' },
|
||||
{ name: 'Mongolian', value: 'mn' },
|
||||
{ name: 'Myanmar', value: 'my' },
|
||||
{ name: 'Nepali', value: 'ne' },
|
||||
{ name: 'Norwegian', value: 'no' },
|
||||
{ name: 'Nynorsk', value: 'nn' },
|
||||
{ name: 'Occitan', value: 'oc' },
|
||||
{ name: 'Pashto', value: 'ps' },
|
||||
{ name: 'Persian', value: 'fa' },
|
||||
{ name: 'Polish', value: 'pl' },
|
||||
{ name: 'Portuguese', value: 'pt' },
|
||||
{ name: 'Punjabi', value: 'pa' },
|
||||
{ name: 'Romanian', value: 'ro' },
|
||||
{ name: 'Russian', value: 'ru' },
|
||||
{ name: 'Sanskrit', value: 'sa' },
|
||||
{ name: 'Serbian', value: 'sr' },
|
||||
{ name: 'Shona', value: 'sn' },
|
||||
{ name: 'Sindhi', value: 'sd' },
|
||||
{ name: 'Sinhala', value: 'si' },
|
||||
{ name: 'Slovak', value: 'sk' },
|
||||
{ name: 'Slovenian', value: 'sl' },
|
||||
{ name: 'Somali', value: 'so' },
|
||||
{ name: 'Spanish', value: 'es' },
|
||||
{ name: 'Sundanese', value: 'su' },
|
||||
{ name: 'Swahili', value: 'sw' },
|
||||
{ name: 'Swedish', value: 'sv' },
|
||||
{ name: 'Tagalog', value: 'tl' },
|
||||
{ name: 'Tajik', value: 'tg' },
|
||||
{ name: 'Tamil', value: 'ta' },
|
||||
{ name: 'Tatar', value: 'tt' },
|
||||
{ name: 'Telugu', value: 'te' },
|
||||
{ name: 'Thai', value: 'th' },
|
||||
{ name: 'Tibetan', value: 'bo' },
|
||||
{ name: 'Turkish', value: 'tr' },
|
||||
{ name: 'Turkmen', value: 'tk' },
|
||||
{ name: 'Ukrainian', value: 'uk' },
|
||||
{ name: 'Urdu', value: 'ur' },
|
||||
{ name: 'Uzbek', value: 'uz' },
|
||||
{ name: 'Vietnamese', value: 'vi' },
|
||||
{ name: 'Welsh', value: 'cy' },
|
||||
{ name: 'Wolof', value: 'wo' },
|
||||
{ name: 'Yiddish', value: 'yi' },
|
||||
{ name: 'Yoruba', value: 'yo' }
|
||||
];
|
||||
@@ -6,6 +6,7 @@ const { SpeechClient } = require('@soniox/soniox-node');
|
||||
const fs = require('fs');
|
||||
const { AssemblyAI } = require('assemblyai');
|
||||
const Houndify = require('houndify');
|
||||
const { GladiaClient } = require('@gladiaio/sdk');
|
||||
const {decrypt, obscureKey} = require('./encrypt-decrypt');
|
||||
const { RealtimeSession } = require('speechmatics');
|
||||
|
||||
@@ -50,6 +51,7 @@ const SttHoundifyLanguagesVoices = require('./speech-data/stt-houndify');
|
||||
const SttVoxistLanguagesVoices = require('./speech-data/stt-voxist');
|
||||
const SttVerbioLanguagesVoices = require('./speech-data/stt-verbio');
|
||||
const SttOpenaiLanguagesVoices = require('./speech-data/stt-openai');
|
||||
const SttGladiaLanguagesVoices = require('./speech-data/stt-gladia');
|
||||
|
||||
|
||||
const SttModelOpenai = require('./speech-data/stt-model-openai');
|
||||
@@ -170,6 +172,65 @@ const testGoogleStt = async(logger, credentials) => {
|
||||
}
|
||||
};
|
||||
|
||||
/**
 * Check a Gladia credential by streaming the bundled test recording through a
 * live transcription session and waiting for the first final transcript.
 *
 * @param {object} logger - logger exposing debug() and error()
 * @param {object} credentials - decrypted credential; only `api_key` is read
 * @returns {Promise<string>} text of the first final utterance
 * @throws rethrows (after logging) any synchronous failure while creating the
 *   client/session or reading the audio file. The returned promise rejects on
 *   a session 'error' event, on a failure while sending audio, or with
 *   'Gladia STT test timeout' if no final transcript arrives within 30 seconds.
 */
const testGladiaStt = async(logger, credentials) => {
  const {api_key} = credentials;

  try {
    const gladiaClient = new GladiaClient({
      apiKey: api_key,
    });
    const gladiaConfig = {
      model: 'solaria-1',
      encoding: 'wav/pcm',
      sample_rate: 16000,
      bit_depth: 16,
      channels: 1,
      language_config: {
        languages: ['en'],
        code_switching: false,
      },
    };
    // Start the live session
    const liveSession = gladiaClient.liveV2().startSession(gladiaConfig);
    // Read the test audio file
    const audioBuffer = fs.readFileSync(`${__dirname}/../../data/test_audio.wav`);

    // Wait for final transcript
    return new Promise((resolve, reject) => {
      let settled = false;
      let timer = null;

      // Single exit point: the first outcome wins and always cancels the
      // watchdog, so no 30 second timer outlives a finished test.
      const settle = (fn, value) => {
        if (settled) return;
        settled = true;
        clearTimeout(timer);
        fn(value);
      };

      // Armed before anything else so every path below can cancel it.
      timer = setTimeout(() => {
        settle(reject, new Error('Gladia STT test timeout'));
      }, 30000); // 30 second timeout

      liveSession.on('message', (message) => {
        // Only the first final transcript matters; ignore anything after that.
        if (settled || message.type !== 'transcript' || !message.data.is_final) return;
        logger.debug(`${message.data.id}: ${message.data.utterance.text}`);
        liveSession.stopRecording();
        settle(resolve, message.data.utterance.text);
      });

      liveSession.on('error', (error) => {
        logger.error({error}, 'Gladia Live STT error');
        settle(reject, error);
      });

      try {
        // Send audio in chunks
        const chunkSize = 1024;
        for (let i = 0; i < audioBuffer.length; i += chunkSize) {
          liveSession.sendAudio(audioBuffer.subarray(i, i + chunkSize));
        }
        // Stop recording after sending all audio
        liveSession.stopRecording();
      } catch (err) {
        // A synchronous send failure must also release the watchdog timer.
        settle(reject, err);
      }
    });

  } catch (error) {
    logger.error({error}, 'Failed to create Gladia Live STT session');
    throw error;
  }
};
|
||||
|
||||
const testDeepgramStt = async(logger, credentials) => {
|
||||
const {api_key, deepgram_stt_uri, deepgram_stt_use_tls} = credentials;
|
||||
const deepgram = new Deepgram(api_key, deepgram_stt_uri, deepgram_stt_uri && deepgram_stt_use_tls);
|
||||
@@ -758,6 +819,10 @@ function decryptCredential(obj, credential, logger, isObscureKey = true) {
|
||||
const o = JSON.parse(decrypt(credential));
|
||||
obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
|
||||
}
|
||||
else if ('gladia' === obj.vendor) {
|
||||
const o = JSON.parse(decrypt(credential));
|
||||
obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
|
||||
}
|
||||
else if ('ibm' === obj.vendor) {
|
||||
const o = JSON.parse(decrypt(credential));
|
||||
obj.tts_api_key = isObscureKey ? obscureKey(o.tts_api_key) : o.tts_api_key;
|
||||
@@ -881,6 +946,8 @@ async function getLanguagesAndVoicesForVendor(logger, vendor, credential, getTts
|
||||
return await getLanguagesVoicesForNuane(credential, getTtsVoices, logger);
|
||||
case 'deepgram':
|
||||
return await getLanguagesVoicesForDeepgram(credential, getTtsVoices, logger);
|
||||
case 'gladia':
|
||||
return await getLanguagesVoicesForGladia(credential, getTtsVoices, logger);
|
||||
case 'ibm':
|
||||
return await getLanguagesVoicesForIbm(credential, getTtsVoices, logger);
|
||||
case 'nvidia':
|
||||
@@ -1052,6 +1119,11 @@ async function getLanguagesVoicesForDeepgram(credential, getTtsVoices, logger) {
|
||||
TtsModelDeepgram, sttModelDeepgram.sort((a, b) => a.name.localeCompare(b.name)));
|
||||
}
|
||||
|
||||
/**
 * Build the supported-languages response for the Gladia vendor.
 *
 * The static Gladia language table is sorted by display name and handed to
 * tranform() as its second argument; the other three arguments are undefined.
 * NOTE(review): the second slot is presumably the STT language list -- confirm
 * against tranform()'s definition.
 *
 * The parameters are unused here; they are kept so the signature matches the
 * other per-vendor helpers invoked from getLanguagesAndVoicesForVendor.
 *
 * @param {object} credential - unused
 * @param {Function} getTtsVoices - unused
 * @param {object} logger - unused
 * @returns {Promise<*>} whatever tranform() returns for the sorted list
 */
async function getLanguagesVoicesForGladia(credential, getTtsVoices, logger) {
  // Sort a copy: Array.prototype.sort works in place, and the table is a
  // shared module-level export that should not be reordered by a request.
  const sttLanguages = [...SttGladiaLanguagesVoices].sort((a, b) => a.name.localeCompare(b.name));
  return tranform(undefined, sttLanguages, undefined, undefined);
}
|
||||
|
||||
async function getLanguagesVoicesForIbm(credential, getTtsVoices, logger) {
|
||||
if (credential) {
|
||||
try {
|
||||
@@ -1706,6 +1778,7 @@ module.exports = {
|
||||
testNuanceTts,
|
||||
testNuanceStt,
|
||||
testDeepgramStt,
|
||||
testGladiaStt,
|
||||
testIbmTts,
|
||||
testIbmStt,
|
||||
testSonioxStt,
|
||||
|
||||
38
package-lock.json
generated
38
package-lock.json
generated
@@ -14,6 +14,7 @@
|
||||
"@aws-sdk/client-transcribe": "^3.549.0",
|
||||
"@azure/storage-blob": "^12.17.0",
|
||||
"@deepgram/sdk": "^1.21.0",
|
||||
"@gladiaio/sdk": "^0.5.2",
|
||||
"@google-cloud/speech": "^6.5.0",
|
||||
"@google-cloud/storage": "^7.9.0",
|
||||
"@jambonz/db-helpers": "^0.9.18",
|
||||
@@ -22,7 +23,7 @@
|
||||
"@jambonz/realtimedb-helpers": "^0.8.15",
|
||||
"@jambonz/speech-utils": "^0.2.25",
|
||||
"@jambonz/time-series": "^0.2.8",
|
||||
"@jambonz/verb-specifications": "^0.0.117",
|
||||
"@jambonz/verb-specifications": "^0.0.118",
|
||||
"@soniox/soniox-node": "^1.2.2",
|
||||
"ajv": "^8.17.1",
|
||||
"argon2": "^0.40.1",
|
||||
@@ -3793,6 +3794,28 @@
|
||||
"node": "^12.22.0 || ^14.17.0 || >=16.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@gladiaio/sdk": {
|
||||
"version": "0.5.2",
|
||||
"resolved": "https://registry.npmjs.org/@gladiaio/sdk/-/sdk-0.5.2.tgz",
|
||||
"integrity": "sha512-v51y75+5Wg/YWVQlTXGdOFiKTrYAHdWR3xZoCqLQdWPyBlynVbiTPf4IpID0zAP/ngu16GGmBVC/XOQZfcj+hg==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">=20"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"eventemitter3": ">=5",
|
||||
"undici": ">=5",
|
||||
"ws": "5.2.4 || 6.2.3 || 7.5.10 || >=8.8"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"undici": {
|
||||
"optional": true
|
||||
},
|
||||
"ws": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/@google-cloud/common": {
|
||||
"version": "5.0.1",
|
||||
"resolved": "https://registry.npmjs.org/@google-cloud/common/-/common-5.0.1.tgz",
|
||||
@@ -4323,9 +4346,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@jambonz/verb-specifications": {
|
||||
"version": "0.0.117",
|
||||
"resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.117.tgz",
|
||||
"integrity": "sha512-yfnHWfqVRyE9ICBdVQV8CFwj0jDInpgMYdv9lI5c4iOmnYrJU3e/iFdrwM+abhFyZGL1Can9onynPW6RZFXXsw==",
|
||||
"version": "0.0.118",
|
||||
"resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.118.tgz",
|
||||
"integrity": "sha512-1dGnc6TUCehjt1yGNuqh1uzk1xw9HhUm39aVUosQMHlnT0fK0ItikeJ0uttTjFastHNmPPxqJwb20wOvVGTCFg==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"debug": "^4.3.4",
|
||||
@@ -7356,6 +7379,13 @@
|
||||
"node": ">=6"
|
||||
}
|
||||
},
|
||||
"node_modules/eventemitter3": {
|
||||
"version": "5.0.1",
|
||||
"resolved": "https://registry.npmjs.org/eventemitter3/-/eventemitter3-5.0.1.tgz",
|
||||
"integrity": "sha512-GWkBvjiSZK87ELrYOSESUYeVIc9mvLLf/nXalMOS5dYrgZq9o5OVkbZAVM06CVxYsCwH9BDZFPlQTlPA1j4ahA==",
|
||||
"license": "MIT",
|
||||
"peer": true
|
||||
},
|
||||
"node_modules/events": {
|
||||
"version": "3.3.0",
|
||||
"resolved": "https://registry.npmjs.org/events/-/events-3.3.0.tgz",
|
||||
|
||||
@@ -25,6 +25,7 @@
|
||||
"@aws-sdk/client-transcribe": "^3.549.0",
|
||||
"@azure/storage-blob": "^12.17.0",
|
||||
"@deepgram/sdk": "^1.21.0",
|
||||
"@gladiaio/sdk": "^0.5.2",
|
||||
"@google-cloud/speech": "^6.5.0",
|
||||
"@google-cloud/storage": "^7.9.0",
|
||||
"@jambonz/db-helpers": "^0.9.18",
|
||||
@@ -33,7 +34,7 @@
|
||||
"@jambonz/realtimedb-helpers": "^0.8.15",
|
||||
"@jambonz/speech-utils": "^0.2.25",
|
||||
"@jambonz/time-series": "^0.2.8",
|
||||
"@jambonz/verb-specifications": "^0.0.117",
|
||||
"@jambonz/verb-specifications": "^0.0.118",
|
||||
"@soniox/soniox-node": "^1.2.2",
|
||||
"ajv": "^8.17.1",
|
||||
"argon2": "^0.40.1",
|
||||
|
||||
@@ -970,6 +970,28 @@ test('speech credentials tests', async(t) => {
|
||||
});
|
||||
t.ok(result.statusCode === 204, 'successfully deleted speech credential deepgramflux');
|
||||
|
||||
/* add a credential for gladia */
|
||||
result = await request.post(`/Accounts/${account_sid}/SpeechCredentials`, {
|
||||
resolveWithFullResponse: true,
|
||||
auth: authUser,
|
||||
json: true,
|
||||
body: {
|
||||
vendor: 'gladia',
|
||||
use_for_tts: false,
|
||||
use_for_stt: true,
|
||||
api_key: 'api_key',
|
||||
}
|
||||
});
|
||||
t.ok(result.statusCode === 201, 'successfully added speech credential for Gladia');
|
||||
const gladiaSid = result.body.sid;
|
||||
|
||||
/* delete the credential */
|
||||
result = await request.delete(`/Accounts/${account_sid}/SpeechCredentials/${gladiaSid}`, {
|
||||
auth: authUser,
|
||||
resolveWithFullResponse: true,
|
||||
});
|
||||
t.ok(result.statusCode === 204, 'successfully deleted speech credential for Gladia');
|
||||
|
||||
/* Check google supportedLanguagesAndVoices */
|
||||
result = await request.get(`/Accounts/${account_sid}/SpeechCredentials/speech/supportedLanguagesAndVoices?vendor=google`, {
|
||||
resolveWithFullResponse: true,
|
||||
@@ -1105,6 +1127,15 @@ test('speech credentials tests', async(t) => {
|
||||
t.ok(result.body.tts.length !== 0, 'successfully get whisper supported languages and voices');
|
||||
t.ok(result.body.models.length !== 0, 'successfully get whisper supported languages and voices');
|
||||
|
||||
/* Check gladia supportedLanguagesAndVoices */
|
||||
result = await request.get(`/Accounts/${account_sid}/SpeechCredentials/speech/supportedLanguagesAndVoices?vendor=gladia`, {
|
||||
resolveWithFullResponse: true,
|
||||
simple: false,
|
||||
auth: authAdmin,
|
||||
json: true,
|
||||
});
|
||||
t.ok(result.body.stt.length !== 0, 'successfully get gladia supported languages and voices');
|
||||
|
||||
await deleteObjectBySid(request, '/Accounts', account_sid);
|
||||
await deleteObjectBySid(request, '/ServiceProviders', service_provider_sid);
|
||||
t.end();
|
||||
|
||||
Reference in New Issue
Block a user