diff --git a/lib/routes/api/speech-credentials.js b/lib/routes/api/speech-credentials.js
index a9ce33c..ead1e2f 100644
--- a/lib/routes/api/speech-credentials.js
+++ b/lib/routes/api/speech-credentials.js
@@ -17,7 +17,8 @@ const {decryptCredential, testWhisper, testDeepgramTTS,
   testOpenAiStt,
   testInworld,
   testResembleTTS,
-  testHoundifyStt} = require('../../utils/speech-utils');
+  testHoundifyStt,
+  testGladiaStt} = require('../../utils/speech-utils');
 const {DbErrorUnprocessableRequest, DbErrorForbidden, DbErrorBadRequest} = require('../../utils/errors');
 const {
   testGoogleTts,
@@ -232,6 +233,11 @@ const encryptCredential = (obj) => {
         deepgram_stt_use_tls, deepgram_tts_uri, model_id});
       return encrypt(deepgramData);
 
+    case 'gladia':
+      assert(api_key, 'invalid gladia speech credential: api_key is required');
+      const gladiaData = JSON.stringify({api_key, region});
+      return encrypt(gladiaData);
+
     case 'resemble':
       assert(api_key, 'invalid resemble speech credential: api_key is required');
       const resembleData = JSON.stringify({
@@ -835,6 +841,18 @@ router.get('/:sid/test', async(req, res) => {
         }
       }
     }
+    else if (cred.vendor === 'gladia') {
+      if (cred.use_for_stt) {
+        try {
+          await testGladiaStt(logger, credential);
+          results.stt.status = 'ok';
+          SpeechCredential.sttTestResult(sid, true);
+        } catch (err) {
+          results.stt = {status: 'fail', reason: err.message};
+          SpeechCredential.sttTestResult(sid, false);
+        }
+      }
+    }
     else if (cred.vendor === 'ibm') {
       const {getTtsVoices} = req.app.locals;
 
diff --git a/lib/utils/speech-data/stt-gladia.js b/lib/utils/speech-data/stt-gladia.js
new file mode 100644
index 0000000..7fe03c8
--- /dev/null
+++ b/lib/utils/speech-data/stt-gladia.js
@@ -0,0 +1,104 @@
+/* Languages offered for Gladia STT: display name plus the language code used as the value. */
+module.exports = [
+  { name: 'Afrikaans', value: 'af' },
+  { name: 'Albanian', value: 'sq' },
+  { name: 'Amharic', value: 'am' },
+  { name: 'Arabic', value: 'ar' },
+  { name: 'Armenian', value: 'hy' },
+  { name: 'Assamese', value: 'as' },
+  { name: 'Azerbaijani', value: 'az' },
+  { name: 'Bashkir', value: 'ba' },
+  { name: 'Basque', value: 'eu' },
+  { name: 'Belarusian', value: 'be' },
+  { name: 'Bengali', value: 'bn' },
+  { name: 'Bosnian', value: 'bs' },
+  { name: 'Breton', value: 'br' },
+  { name: 'Bulgarian', value: 'bg' },
+  { name: 'Cantonese', value: 'yue' },
+  { name: 'Catalan', value: 'ca' },
+  { name: 'Chinese', value: 'zh' },
+  { name: 'Croatian', value: 'hr' },
+  { name: 'Czech', value: 'cs' },
+  { name: 'Danish', value: 'da' },
+  { name: 'Dutch', value: 'nl' },
+  { name: 'English', value: 'en' },
+  { name: 'Estonian', value: 'et' },
+  { name: 'Faroese', value: 'fo' },
+  { name: 'Finnish', value: 'fi' },
+  { name: 'French', value: 'fr' },
+  { name: 'Galician', value: 'gl' },
+  { name: 'Georgian', value: 'ka' },
+  { name: 'German', value: 'de' },
+  { name: 'Greek', value: 'el' },
+  { name: 'Gujarati', value: 'gu' },
+  { name: 'Haitian Creole', value: 'ht' },
+  { name: 'Hausa', value: 'ha' },
+  { name: 'Hawaiian', value: 'haw' },
+  { name: 'Hebrew', value: 'he' },
+  { name: 'Hindi', value: 'hi' },
+  { name: 'Hungarian', value: 'hu' },
+  { name: 'Icelandic', value: 'is' },
+  { name: 'Indonesian', value: 'id' },
+  { name: 'Italian', value: 'it' },
+  { name: 'Japanese', value: 'ja' },
+  { name: 'Javanese', value: 'jw' },
+  { name: 'Kannada', value: 'kn' },
+  { name: 'Kazakh', value: 'kk' },
+  { name: 'Khmer', value: 'km' },
+  { name: 'Korean', value: 'ko' },
+  { name: 'Lao', value: 'lo' },
+  { name: 'Latin', value: 'la' },
+  { name: 'Latvian', value: 'lv' },
+  { name: 'Lingala', value: 'ln' },
+  { name: 'Lithuanian', value: 'lt' },
+  { name: 'Luxembourgish', value: 'lb' },
+  { name: 'Macedonian', value: 'mk' },
+  { name: 'Malagasy', value: 'mg' },
+  { name: 'Malay', value: 'ms' },
+  { name: 'Malayalam', value: 'ml' },
+  { name: 'Maltese', value: 'mt' },
+  { name: 'Maori', value: 'mi' },
+  { name: 'Marathi', value: 'mr' },
+  { name: 'Mongolian', value: 'mn' },
+  { name: 'Myanmar', value: 'my' },
+  { name: 'Nepali', value: 'ne' },
+  { name: 'Norwegian', value: 'no' },
+  { name: 'Nynorsk', value: 'nn' },
+  { name: 'Occitan', value: 'oc' },
+  { name: 'Pashto', value: 'ps' },
+  { name: 'Persian', value: 'fa' },
+  { name: 'Polish', value: 'pl' },
+  { name: 'Portuguese', value: 'pt' },
+  { name: 'Punjabi', value: 'pa' },
+  { name: 'Romanian', value: 'ro' },
+  { name: 'Russian', value: 'ru' },
+  { name: 'Sanskrit', value: 'sa' },
+  { name: 'Serbian', value: 'sr' },
+  { name: 'Shona', value: 'sn' },
+  { name: 'Sindhi', value: 'sd' },
+  { name: 'Sinhala', value: 'si' },
+  { name: 'Slovak', value: 'sk' },
+  { name: 'Slovenian', value: 'sl' },
+  { name: 'Somali', value: 'so' },
+  { name: 'Spanish', value: 'es' },
+  { name: 'Sundanese', value: 'su' },
+  { name: 'Swahili', value: 'sw' },
+  { name: 'Swedish', value: 'sv' },
+  { name: 'Tagalog', value: 'tl' },
+  { name: 'Tajik', value: 'tg' },
+  { name: 'Tamil', value: 'ta' },
+  { name: 'Tatar', value: 'tt' },
+  { name: 'Telugu', value: 'te' },
+  { name: 'Thai', value: 'th' },
+  { name: 'Tibetan', value: 'bo' },
+  { name: 'Turkish', value: 'tr' },
+  { name: 'Turkmen', value: 'tk' },
+  { name: 'Ukrainian', value: 'uk' },
+  { name: 'Urdu', value: 'ur' },
+  { name: 'Uzbek', value: 'uz' },
+  { name: 'Vietnamese', value: 'vi' },
+  { name: 'Welsh', value: 'cy' },
+  { name: 'Wolof', value: 'wo' },
+  { name: 'Yiddish', value: 'yi' },
+  { name: 'Yoruba', value: 'yo' }
+];
diff --git a/lib/utils/speech-utils.js b/lib/utils/speech-utils.js
index 8c4c318..e27eaa2 100644
--- a/lib/utils/speech-utils.js
+++ b/lib/utils/speech-utils.js
@@ -6,6 +6,7 @@ const { SpeechClient } = require('@soniox/soniox-node');
 const fs = require('fs');
 const { AssemblyAI } = require('assemblyai');
 const Houndify = require('houndify');
+const { GladiaClient } = require('@gladiaio/sdk');
 const {decrypt, obscureKey} = require('./encrypt-decrypt');
 const { RealtimeSession } = require('speechmatics');
 
@@ -50,6 +51,7 @@ const SttHoundifyLanguagesVoices = require('./speech-data/stt-houndify');
 const SttVoxistLanguagesVoices = require('./speech-data/stt-voxist');
 const
SttVerbioLanguagesVoices = require('./speech-data/stt-verbio');
 const SttOpenaiLanguagesVoices = require('./speech-data/stt-openai');
+const SttGladiaLanguagesVoices = require('./speech-data/stt-gladia');
 
 const SttModelOpenai = require('./speech-data/stt-model-openai');
 
@@ -170,6 +172,90 @@ const testGoogleStt = async(logger, credentials) => {
   }
 };
 
+/**
+ * Verify a Gladia credential by streaming the bundled test audio through a live
+ * (v2) session and waiting for the first final transcript.
+ *
+ * @param {object} logger - logger providing debug() and error()
+ * @param {object} credentials - decrypted credential; only api_key is read
+ * @returns {Promise<string>} text of the first final transcript
+ * @throws when the session cannot be created, Gladia reports an error, or no
+ * final transcript arrives within GLADIA_TEST_TIMEOUT_MS
+ */
+const testGladiaStt = async(logger, credentials) => {
+  const {api_key} = credentials;
+  const GLADIA_TEST_TIMEOUT_MS = 30000;
+  const chunkSize = 1024;
+  let audioBuffer;
+  let liveSession;
+
+  try {
+    // Read the test audio first so a missing file fails before a session is opened
+    audioBuffer = fs.readFileSync(`${__dirname}/../../data/test_audio.wav`);
+    const gladiaClient = new GladiaClient({
+      apiKey: api_key,
+    });
+    // Start the live session
+    liveSession = gladiaClient.liveV2().startSession({
+      model: 'solaria-1',
+      encoding: 'wav/pcm',
+      sample_rate: 16000,
+      bit_depth: 16,
+      channels: 1,
+      language_config: {
+        languages: ['en'],
+        code_switching: false,
+      },
+    });
+  } catch (error) {
+    logger.error({error}, 'Failed to create Gladia Live STT session');
+    throw error;
+  }
+
+  // Resolve on the first final transcript; reject on error or timeout
+  return new Promise((resolve, reject) => {
+    let settled = false;
+    let timer = null;
+    // Settle only once and always clear the timer so it cannot outlive the test
+    const settle = (fn, value) => {
+      if (settled) return;
+      settled = true;
+      clearTimeout(timer);
+      fn(value);
+    };
+
+    liveSession.on('message', (message) => {
+      if (!settled && message.type === 'transcript' && message.data.is_final) {
+        logger.debug(`${message.data.id}: ${message.data.utterance.text}`);
+        liveSession.stopRecording();
+        settle(resolve, message.data.utterance.text);
+      }
+    });
+
+    liveSession.on('error', (error) => {
+      logger.error({error}, 'Gladia Live STT error');
+      settle(reject, error);
+    });
+
+    // Set a timeout to prevent hanging
+    timer = setTimeout(() => {
+      settle(reject, new Error('Gladia STT test timeout'));
+    }, GLADIA_TEST_TIMEOUT_MS);
+
+    try {
+      // Send audio in chunks
+      for (let i = 0; i < audioBuffer.length; i += chunkSize) {
+        // subarray() replaces the deprecated Buffer#slice(); both return a view
+        liveSession.sendAudio(audioBuffer.subarray(i, i + chunkSize));
+      }
+      // Stop recording after sending all audio
+      liveSession.stopRecording();
+    } catch (error) {
+      settle(reject, error);
+    }
+  });
+};
+
 const testDeepgramStt = async(logger, credentials) => {
   const {api_key,
deepgram_stt_uri, deepgram_stt_use_tls} = credentials;
   const deepgram = new Deepgram(api_key, deepgram_stt_uri, deepgram_stt_uri && deepgram_stt_use_tls);
@@ -758,6 +819,12 @@ function decryptCredential(obj, credential, logger, isObscureKey = true) {
     const o = JSON.parse(decrypt(credential));
     obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
   }
+  else if ('gladia' === obj.vendor) {
+    const o = JSON.parse(decrypt(credential));
+    obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
+    // region is stored by encryptCredential alongside api_key; surface it as well
+    obj.region = o.region;
+  }
   else if ('ibm' === obj.vendor) {
     const o = JSON.parse(decrypt(credential));
     obj.tts_api_key = isObscureKey ? obscureKey(o.tts_api_key) : o.tts_api_key;
@@ -881,6 +948,8 @@ async function getLanguagesAndVoicesForVendor(logger, vendor, credential, getTts
       return await getLanguagesVoicesForNuane(credential, getTtsVoices, logger);
     case 'deepgram':
       return await getLanguagesVoicesForDeepgram(credential, getTtsVoices, logger);
+    case 'gladia':
+      return await getLanguagesVoicesForGladia(credential, getTtsVoices, logger);
     case 'ibm':
       return await getLanguagesVoicesForIbm(credential, getTtsVoices, logger);
     case 'nvidia':
@@ -1052,6 +1121,15 @@ async function getLanguagesVoicesForDeepgram(credential, getTtsVoices, logger) {
     TtsModelDeepgram, sttModelDeepgram.sort((a, b) => a.name.localeCompare(b.name)));
 }
 
+/**
+ * Passes only the static Gladia STT language list, sorted by name, to tranform().
+ * The list is copied before sorting so the shared module array is not mutated.
+ */
+async function getLanguagesVoicesForGladia(credential, getTtsVoices, logger) {
+  return tranform(undefined, [...SttGladiaLanguagesVoices].sort((a, b) => a.name.localeCompare(b.name)),
+    undefined, undefined);
+}
+
 async function getLanguagesVoicesForIbm(credential, getTtsVoices, logger) {
   if (credential) {
     try {
@@ -1706,6 +1784,7 @@ module.exports = {
   testNuanceTts,
   testNuanceStt,
   testDeepgramStt,
+  testGladiaStt,
   testIbmTts,
   testIbmStt,
   testSonioxStt,
diff --git a/package-lock.json b/package-lock.json
index 6207b06..f4c7dcd 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -14,6 +14,7 @@
         "@aws-sdk/client-transcribe": "^3.549.0",
         "@azure/storage-blob": "^12.17.0",
"@deepgram/sdk": "^1.21.0", + "@gladiaio/sdk": "^0.5.2", "@google-cloud/speech": "^6.5.0", "@google-cloud/storage": "^7.9.0", "@jambonz/db-helpers": "^0.9.18", @@ -22,7 +23,7 @@ "@jambonz/realtimedb-helpers": "^0.8.15", "@jambonz/speech-utils": "^0.2.25", "@jambonz/time-series": "^0.2.8", - "@jambonz/verb-specifications": "^0.0.117", + "@jambonz/verb-specifications": "^0.0.118", "@soniox/soniox-node": "^1.2.2", "ajv": "^8.17.1", "argon2": "^0.40.1", @@ -3793,6 +3794,28 @@ "node": "^12.22.0 || ^14.17.0 || >=16.0.0" } }, + "node_modules/@gladiaio/sdk": { + "version": "0.5.2", + "resolved": "https://registry.npmjs.org/@gladiaio/sdk/-/sdk-0.5.2.tgz", + "integrity": "sha512-v51y75+5Wg/YWVQlTXGdOFiKTrYAHdWR3xZoCqLQdWPyBlynVbiTPf4IpID0zAP/ngu16GGmBVC/XOQZfcj+hg==", + "license": "MIT", + "engines": { + "node": ">=20" + }, + "peerDependencies": { + "eventemitter3": ">=5", + "undici": ">=5", + "ws": "5.2.4 || 6.2.3 || 7.5.10 || >=8.8" + }, + "peerDependenciesMeta": { + "undici": { + "optional": true + }, + "ws": { + "optional": true + } + } + }, "node_modules/@google-cloud/common": { "version": "5.0.1", "resolved": "https://registry.npmjs.org/@google-cloud/common/-/common-5.0.1.tgz", @@ -4323,9 +4346,9 @@ } }, "node_modules/@jambonz/verb-specifications": { - "version": "0.0.117", - "resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.117.tgz", - "integrity": "sha512-yfnHWfqVRyE9ICBdVQV8CFwj0jDInpgMYdv9lI5c4iOmnYrJU3e/iFdrwM+abhFyZGL1Can9onynPW6RZFXXsw==", + "version": "0.0.118", + "resolved": "https://registry.npmjs.org/@jambonz/verb-specifications/-/verb-specifications-0.0.118.tgz", + "integrity": "sha512-1dGnc6TUCehjt1yGNuqh1uzk1xw9HhUm39aVUosQMHlnT0fK0ItikeJ0uttTjFastHNmPPxqJwb20wOvVGTCFg==", "license": "MIT", "dependencies": { "debug": "^4.3.4", @@ -7356,6 +7379,13 @@ "node": ">=6" } }, + "node_modules/eventemitter3": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/eventemitter3/-/eventemitter3-5.0.1.tgz", + 
"integrity": "sha512-GWkBvjiSZK87ELrYOSESUYeVIc9mvLLf/nXalMOS5dYrgZq9o5OVkbZAVM06CVxYsCwH9BDZFPlQTlPA1j4ahA==", + "license": "MIT", + "peer": true + }, "node_modules/events": { "version": "3.3.0", "resolved": "https://registry.npmjs.org/events/-/events-3.3.0.tgz", diff --git a/package.json b/package.json index 49e7430..7c8a569 100644 --- a/package.json +++ b/package.json @@ -25,6 +25,7 @@ "@aws-sdk/client-transcribe": "^3.549.0", "@azure/storage-blob": "^12.17.0", "@deepgram/sdk": "^1.21.0", + "@gladiaio/sdk": "^0.5.2", "@google-cloud/speech": "^6.5.0", "@google-cloud/storage": "^7.9.0", "@jambonz/db-helpers": "^0.9.18", @@ -33,7 +34,7 @@ "@jambonz/realtimedb-helpers": "^0.8.15", "@jambonz/speech-utils": "^0.2.25", "@jambonz/time-series": "^0.2.8", - "@jambonz/verb-specifications": "^0.0.117", + "@jambonz/verb-specifications": "^0.0.118", "@soniox/soniox-node": "^1.2.2", "ajv": "^8.17.1", "argon2": "^0.40.1", diff --git a/test/speech-credentials.js b/test/speech-credentials.js index 856b3f1..d642dcb 100644 --- a/test/speech-credentials.js +++ b/test/speech-credentials.js @@ -970,6 +970,28 @@ test('speech credentials tests', async(t) => { }); t.ok(result.statusCode === 204, 'successfully deleted speech credential deepgramflux'); + /* add a credential for gladia */ + result = await request.post(`/Accounts/${account_sid}/SpeechCredentials`, { + resolveWithFullResponse: true, + auth: authUser, + json: true, + body: { + vendor: 'gladia', + use_for_tts: false, + use_for_stt: true, + api_key: 'api_key', + } + }); + t.ok(result.statusCode === 201, 'successfully added speech credential for Gladia'); + const gladiaSid = result.body.sid; + + /* delete the credential */ + result = await request.delete(`/Accounts/${account_sid}/SpeechCredentials/${gladiaSid}`, { + auth: authUser, + resolveWithFullResponse: true, + }); + t.ok(result.statusCode === 204, 'successfully deleted speech credential for Gladia'); + /* Check google supportedLanguagesAndVoices */ result = await 
request.get(`/Accounts/${account_sid}/SpeechCredentials/speech/supportedLanguagesAndVoices?vendor=google`, { resolveWithFullResponse: true, @@ -1105,6 +1127,15 @@ test('speech credentials tests', async(t) => { t.ok(result.body.tts.length !== 0, 'successfully get whisper supported languages and voices'); t.ok(result.body.models.length !== 0, 'successfully get whisper supported languages and voices'); + /* Check gladia supportedLanguagesAndVoices */ + result = await request.get(`/Accounts/${account_sid}/SpeechCredentials/speech/supportedLanguagesAndVoices?vendor=gladia`, { + resolveWithFullResponse: true, + simple: false, + auth: authAdmin, + json: true, + }); + t.ok(result.body.stt.length !== 0, 'successfully get gladia supported languages and voices'); + await deleteObjectBySid(request, '/Accounts', account_sid); await deleteObjectBySid(request, '/ServiceProviders', service_provider_sid); t.end();