support gladia stt (#503)

* support gladia stt

* wip

* update verb specification
This commit is contained in:
Hoan Luu Huu
2025-10-20 15:47:17 +07:00
committed by GitHub
parent 0f1f5e9b73
commit 42f4318a17
6 changed files with 261 additions and 6 deletions

View File

@@ -17,7 +17,8 @@ const {decryptCredential, testWhisper, testDeepgramTTS,
testOpenAiStt,
testInworld,
testResembleTTS,
testHoundifyStt} = require('../../utils/speech-utils');
testHoundifyStt,
testGladiaStt} = require('../../utils/speech-utils');
const {DbErrorUnprocessableRequest, DbErrorForbidden, DbErrorBadRequest} = require('../../utils/errors');
const {
testGoogleTts,
@@ -232,6 +233,10 @@ const encryptCredential = (obj) => {
deepgram_stt_use_tls, deepgram_tts_uri, model_id});
return encrypt(deepgramData);
case 'gladia':
const gladiaData = JSON.stringify({api_key, region});
return encrypt(gladiaData);
case 'resemble':
assert(api_key, 'invalid resemble speech credential: api_key is required');
const resembleData = JSON.stringify({
@@ -835,6 +840,18 @@ router.get('/:sid/test', async(req, res) => {
}
}
}
else if (cred.vendor === 'gladia') {
if (cred.use_for_stt) {
try {
await testGladiaStt(logger, credential);
results.stt.status = 'ok';
SpeechCredential.sttTestResult(sid, true);
} catch (err) {
results.stt = {status: 'fail', reason: err.message};
SpeechCredential.sttTestResult(sid, false);
}
}
}
else if (cred.vendor === 'ibm') {
const {getTtsVoices} = req.app.locals;

View File

@@ -0,0 +1,103 @@
/*
 * Static table of language options, exported as an array of {name, value} entries:
 *   name  - human-readable language name
 *   value - short language code for that language
 * NOTE(review): presumably this is the speech-data/stt-gladia list consumed by
 * getLanguagesVoicesForGladia, and the codes look like ISO 639 identifiers --
 * confirm both against Gladia's supported-language documentation.
 */
module.exports = [
{ name: 'Afrikaans', value: 'af' },
{ name: 'Albanian', value: 'sq' },
{ name: 'Amharic', value: 'am' },
{ name: 'Arabic', value: 'ar' },
{ name: 'Armenian', value: 'hy' },
{ name: 'Assamese', value: 'as' },
{ name: 'Azerbaijani', value: 'az' },
{ name: 'Bashkir', value: 'ba' },
{ name: 'Basque', value: 'eu' },
{ name: 'Belarusian', value: 'be' },
{ name: 'Bengali', value: 'bn' },
{ name: 'Bosnian', value: 'bs' },
{ name: 'Breton', value: 'br' },
{ name: 'Bulgarian', value: 'bg' },
{ name: 'Cantonese', value: 'yue' },
{ name: 'Catalan', value: 'ca' },
{ name: 'Chinese', value: 'zh' },
{ name: 'Croatian', value: 'hr' },
{ name: 'Czech', value: 'cs' },
{ name: 'Danish', value: 'da' },
{ name: 'Dutch', value: 'nl' },
{ name: 'English', value: 'en' },
{ name: 'Estonian', value: 'et' },
{ name: 'Faroese', value: 'fo' },
{ name: 'Finnish', value: 'fi' },
{ name: 'French', value: 'fr' },
{ name: 'Galician', value: 'gl' },
{ name: 'Georgian', value: 'ka' },
{ name: 'German', value: 'de' },
{ name: 'Greek', value: 'el' },
{ name: 'Gujarati', value: 'gu' },
{ name: 'Haitian Creole', value: 'ht' },
{ name: 'Hausa', value: 'ha' },
{ name: 'Hawaiian', value: 'haw' },
{ name: 'Hebrew', value: 'he' },
{ name: 'Hindi', value: 'hi' },
{ name: 'Hungarian', value: 'hu' },
{ name: 'Icelandic', value: 'is' },
{ name: 'Indonesian', value: 'id' },
{ name: 'Italian', value: 'it' },
{ name: 'Japanese', value: 'ja' },
{ name: 'Javanese', value: 'jw' },
{ name: 'Kannada', value: 'kn' },
{ name: 'Kazakh', value: 'kk' },
{ name: 'Khmer', value: 'km' },
{ name: 'Korean', value: 'ko' },
{ name: 'Lao', value: 'lo' },
{ name: 'Latin', value: 'la' },
{ name: 'Latvian', value: 'lv' },
{ name: 'Lingala', value: 'ln' },
{ name: 'Lithuanian', value: 'lt' },
{ name: 'Luxembourgish', value: 'lb' },
{ name: 'Macedonian', value: 'mk' },
{ name: 'Malagasy', value: 'mg' },
{ name: 'Malay', value: 'ms' },
{ name: 'Malayalam', value: 'ml' },
{ name: 'Maltese', value: 'mt' },
{ name: 'Maori', value: 'mi' },
{ name: 'Marathi', value: 'mr' },
{ name: 'Mongolian', value: 'mn' },
{ name: 'Myanmar', value: 'my' },
{ name: 'Nepali', value: 'ne' },
{ name: 'Norwegian', value: 'no' },
{ name: 'Nynorsk', value: 'nn' },
{ name: 'Occitan', value: 'oc' },
{ name: 'Pashto', value: 'ps' },
{ name: 'Persian', value: 'fa' },
{ name: 'Polish', value: 'pl' },
{ name: 'Portuguese', value: 'pt' },
{ name: 'Punjabi', value: 'pa' },
{ name: 'Romanian', value: 'ro' },
{ name: 'Russian', value: 'ru' },
{ name: 'Sanskrit', value: 'sa' },
{ name: 'Serbian', value: 'sr' },
{ name: 'Shona', value: 'sn' },
{ name: 'Sindhi', value: 'sd' },
{ name: 'Sinhala', value: 'si' },
{ name: 'Slovak', value: 'sk' },
{ name: 'Slovenian', value: 'sl' },
{ name: 'Somali', value: 'so' },
{ name: 'Spanish', value: 'es' },
{ name: 'Sundanese', value: 'su' },
{ name: 'Swahili', value: 'sw' },
{ name: 'Swedish', value: 'sv' },
{ name: 'Tagalog', value: 'tl' },
{ name: 'Tajik', value: 'tg' },
{ name: 'Tamil', value: 'ta' },
{ name: 'Tatar', value: 'tt' },
{ name: 'Telugu', value: 'te' },
{ name: 'Thai', value: 'th' },
{ name: 'Tibetan', value: 'bo' },
{ name: 'Turkish', value: 'tr' },
{ name: 'Turkmen', value: 'tk' },
{ name: 'Ukrainian', value: 'uk' },
{ name: 'Urdu', value: 'ur' },
{ name: 'Uzbek', value: 'uz' },
{ name: 'Vietnamese', value: 'vi' },
{ name: 'Welsh', value: 'cy' },
{ name: 'Wolof', value: 'wo' },
{ name: 'Yiddish', value: 'yi' },
{ name: 'Yoruba', value: 'yo' }
];

View File

@@ -6,6 +6,7 @@ const { SpeechClient } = require('@soniox/soniox-node');
const fs = require('fs');
const { AssemblyAI } = require('assemblyai');
const Houndify = require('houndify');
const { GladiaClient } = require('@gladiaio/sdk');
const {decrypt, obscureKey} = require('./encrypt-decrypt');
const { RealtimeSession } = require('speechmatics');
@@ -50,6 +51,7 @@ const SttHoundifyLanguagesVoices = require('./speech-data/stt-houndify');
const SttVoxistLanguagesVoices = require('./speech-data/stt-voxist');
const SttVerbioLanguagesVoices = require('./speech-data/stt-verbio');
const SttOpenaiLanguagesVoices = require('./speech-data/stt-openai');
const SttGladiaLanguagesVoices = require('./speech-data/stt-gladia');
const SttModelOpenai = require('./speech-data/stt-model-openai');
@@ -170,6 +172,65 @@ const testGoogleStt = async(logger, credentials) => {
}
};
/**
 * Verify a Gladia credential by streaming the bundled test audio through a
 * live transcription session and waiting for a final transcript.
 *
 * @param {object} logger - logger exposing debug() and error()
 * @param {object} credentials - credential object; only `api_key` is read
 * @returns {Promise<string>} resolves with the text of the first final transcript
 * @throws rethrows (after logging) any synchronous failure while reading the
 *   audio file or creating the client/session; the returned promise rejects
 *   on a session 'error' event or when no final transcript arrives within
 *   30 seconds
 */
const testGladiaStt = async(logger, credentials) => {
  const {api_key} = credentials;
  try {
    // Read the test audio before opening the session, so a missing or
    // unreadable file fails fast without leaving a live session behind
    const audioBuffer = fs.readFileSync(`${__dirname}/../../data/test_audio.wav`);

    const gladiaClient = new GladiaClient({
      apiKey: api_key,
    });

    const gladiaConfig = {
      model: 'solaria-1',
      encoding: 'wav/pcm',
      sample_rate: 16000,
      bit_depth: 16,
      channels: 1,
      language_config: {
        languages: ['en'],
        code_switching: false,
      },
    };

    // Start the live session
    const liveSession = gladiaClient.liveV2().startSession(gladiaConfig);

    // Wait for final transcript
    return new Promise((resolve, reject) => {
      let timer;
      let settled = false;

      // Settle the promise at most once and always release the timeout, so a
      // finished test never leaves a 30 second timer pending
      const settle = (fn, value) => {
        if (settled) return;
        settled = true;
        clearTimeout(timer);
        fn(value);
      };

      liveSession.on('message', (message) => {
        if (message.type === 'transcript' && message.data.is_final) {
          logger.debug(`${message.data.id}: ${message.data.utterance.text}`);
          liveSession.stopRecording();
          settle(resolve, message.data.utterance.text);
        }
      });

      liveSession.on('error', (error) => {
        logger.error({error}, 'Gladia Live STT error');
        settle(reject, error);
      });

      // Send audio in chunks
      const chunkSize = 1024;
      for (let i = 0; i < audioBuffer.length; i += chunkSize) {
        // subarray() returns the same view as the deprecated Buffer#slice()
        liveSession.sendAudio(audioBuffer.subarray(i, i + chunkSize));
      }

      // Stop recording after sending all audio
      liveSession.stopRecording();

      // Set a timeout to prevent hanging
      timer = setTimeout(() => {
        settle(reject, new Error('Gladia STT test timeout'));
      }, 30000); // 30 second timeout
    });
  } catch (error) {
    logger.error({error}, 'Failed to create Gladia Live STT session');
    throw error;
  }
};
const testDeepgramStt = async(logger, credentials) => {
const {api_key, deepgram_stt_uri, deepgram_stt_use_tls} = credentials;
const deepgram = new Deepgram(api_key, deepgram_stt_uri, deepgram_stt_uri && deepgram_stt_use_tls);
@@ -758,6 +819,10 @@ function decryptCredential(obj, credential, logger, isObscureKey = true) {
const o = JSON.parse(decrypt(credential));
obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
}
else if ('gladia' === obj.vendor) {
const o = JSON.parse(decrypt(credential));
obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
}
else if ('ibm' === obj.vendor) {
const o = JSON.parse(decrypt(credential));
obj.tts_api_key = isObscureKey ? obscureKey(o.tts_api_key) : o.tts_api_key;
@@ -881,6 +946,8 @@ async function getLanguagesAndVoicesForVendor(logger, vendor, credential, getTts
return await getLanguagesVoicesForNuane(credential, getTtsVoices, logger);
case 'deepgram':
return await getLanguagesVoicesForDeepgram(credential, getTtsVoices, logger);
case 'gladia':
return await getLanguagesVoicesForGladia(credential, getTtsVoices, logger);
case 'ibm':
return await getLanguagesVoicesForIbm(credential, getTtsVoices, logger);
case 'nvidia':
@@ -1052,6 +1119,11 @@ async function getLanguagesVoicesForDeepgram(credential, getTtsVoices, logger) {
TtsModelDeepgram, sttModelDeepgram.sort((a, b) => a.name.localeCompare(b.name)));
}
/**
 * Build the languages-and-voices payload for the Gladia vendor.
 *
 * The three parameters are accepted for signature parity with the other
 * vendor helpers but are not read here.
 * NOTE(review): the argument order of `tranform` is presumably
 * (tts, stt, ttsModels, sttModels) -- confirm against its definition.
 *
 * @returns {Promise<*>} whatever `tranform` returns for the sorted Gladia STT list
 */
async function getLanguagesVoicesForGladia(credential, getTtsVoices, logger) {
  const byDisplayName = (left, right) => left.name.localeCompare(right.name);
  // Array#sort orders the shared module-level list in place and returns it
  const sttLanguages = SttGladiaLanguagesVoices.sort(byDisplayName);
  return tranform(undefined, sttLanguages, undefined, undefined);
}
async function getLanguagesVoicesForIbm(credential, getTtsVoices, logger) {
if (credential) {
try {
@@ -1706,6 +1778,7 @@ module.exports = {
testNuanceTts,
testNuanceStt,
testDeepgramStt,
testGladiaStt,
testIbmTts,
testIbmStt,
testSonioxStt,

38
package-lock.json generated
View File

@@ -14,6 +14,7 @@
"@aws-sdk/client-transcribe": "^3.549.0",
"@azure/storage-blob": "^12.17.0",
"@deepgram/sdk": "^1.21.0",
"@gladiaio/sdk": "^0.5.2",
"@google-cloud/speech": "^6.5.0",
"@google-cloud/storage": "^7.9.0",
"@jambonz/db-helpers": "^0.9.18",
@@ -22,7 +23,7 @@
"@jambonz/realtimedb-helpers": "^0.8.15",
"@jambonz/speech-utils": "^0.2.25",
"@jambonz/time-series": "^0.2.8",
"@jambonz/verb-specifications": "^0.0.117",
"@jambonz/verb-specifications": "^0.0.118",
"@soniox/soniox-node": "^1.2.2",
"ajv": "^8.17.1",
"argon2": "^0.40.1",
@@ -3793,6 +3794,28 @@
"node": "^12.22.0 || ^14.17.0 || >=16.0.0"
}
},
"node_modules/@gladiaio/sdk": {
"version": "0.5.2",
"resolved": "https://registry.npmjs.org/@gladiaio/sdk/-/sdk-0.5.2.tgz",
"integrity": "sha512-v51y75+5Wg/YWVQlTXGdOFiKTrYAHdWR3xZoCqLQdWPyBlynVbiTPf4IpID0zAP/ngu16GGmBVC/XOQZfcj+hg==",
"license": "MIT",
"engines": {
"node": ">=20"
},
"peerDependencies": {
"eventemitter3": ">=5",
"undici": ">=5",
"ws": "5.2.4 || 6.2.3 || 7.5.10 || >=8.8"
},
"peerDependenciesMeta": {
"undici": {
"optional": true
},
"ws": {
"optional": true
}
}
},
"node_modules/@google-cloud/common": {
"version": "5.0.1",
"resolved": "https://registry.npmjs.org/@google-cloud/common/-/common-5.0.1.tgz",
@@ -4323,9 +4346,9 @@
}
},
"node_modules/@jambonz/verb-specifications": {
"version": "0.0.117",
"resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.117.tgz",
"integrity": "sha512-yfnHWfqVRyE9ICBdVQV8CFwj0jDInpgMYdv9lI5c4iOmnYrJU3e/iFdrwM+abhFyZGL1Can9onynPW6RZFXXsw==",
"version": "0.0.118",
"resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.118.tgz",
"integrity": "sha512-1dGnc6TUCehjt1yGNuqh1uzk1xw9HhUm39aVUosQMHlnT0fK0ItikeJ0uttTjFastHNmPPxqJwb20wOvVGTCFg==",
"license": "MIT",
"dependencies": {
"debug": "^4.3.4",
@@ -7356,6 +7379,13 @@
"node": ">=6"
}
},
"node_modules/eventemitter3": {
"version": "5.0.1",
"resolved": "https://registry.npmjs.org/eventemitter3/-/eventemitter3-5.0.1.tgz",
"integrity": "sha512-GWkBvjiSZK87ELrYOSESUYeVIc9mvLLf/nXalMOS5dYrgZq9o5OVkbZAVM06CVxYsCwH9BDZFPlQTlPA1j4ahA==",
"license": "MIT",
"peer": true
},
"node_modules/events": {
"version": "3.3.0",
"resolved": "https://registry.npmjs.org/events/-/events-3.3.0.tgz",

View File

@@ -25,6 +25,7 @@
"@aws-sdk/client-transcribe": "^3.549.0",
"@azure/storage-blob": "^12.17.0",
"@deepgram/sdk": "^1.21.0",
"@gladiaio/sdk": "^0.5.2",
"@google-cloud/speech": "^6.5.0",
"@google-cloud/storage": "^7.9.0",
"@jambonz/db-helpers": "^0.9.18",
@@ -33,7 +34,7 @@
"@jambonz/realtimedb-helpers": "^0.8.15",
"@jambonz/speech-utils": "^0.2.25",
"@jambonz/time-series": "^0.2.8",
"@jambonz/verb-specifications": "^0.0.117",
"@jambonz/verb-specifications": "^0.0.118",
"@soniox/soniox-node": "^1.2.2",
"ajv": "^8.17.1",
"argon2": "^0.40.1",

View File

@@ -970,6 +970,28 @@ test('speech credentials tests', async(t) => {
});
t.ok(result.statusCode === 204, 'successfully deleted speech credential deepgramflux');
/* add a credential for gladia */
result = await request.post(`/Accounts/${account_sid}/SpeechCredentials`, {
resolveWithFullResponse: true,
auth: authUser,
json: true,
body: {
vendor: 'gladia',
use_for_tts: false,
use_for_stt: true,
api_key: 'api_key',
}
});
t.ok(result.statusCode === 201, 'successfully added speech credential for Gladia');
const gladiaSid = result.body.sid;
/* delete the credential */
result = await request.delete(`/Accounts/${account_sid}/SpeechCredentials/${gladiaSid}`, {
auth: authUser,
resolveWithFullResponse: true,
});
t.ok(result.statusCode === 204, 'successfully deleted speech credential for Gladia');
/* Check google supportedLanguagesAndVoices */
result = await request.get(`/Accounts/${account_sid}/SpeechCredentials/speech/supportedLanguagesAndVoices?vendor=google`, {
resolveWithFullResponse: true,
@@ -1105,6 +1127,15 @@ test('speech credentials tests', async(t) => {
t.ok(result.body.tts.length !== 0, 'successfully get whisper supported languages and voices');
t.ok(result.body.models.length !== 0, 'successfully get whisper supported languages and voices');
/* Check gladia supportedLanguagesAndVoices */
result = await request.get(`/Accounts/${account_sid}/SpeechCredentials/speech/supportedLanguagesAndVoices?vendor=gladia`, {
resolveWithFullResponse: true,
simple: false,
auth: authAdmin,
json: true,
});
t.ok(result.body.stt.length !== 0, 'successfully get gladia supported languages and voices');
await deleteObjectBySid(request, '/Accounts', account_sid);
await deleteObjectBySid(request, '/ServiceProviders', service_provider_sid);
t.end();