From 2d2b98dab585b175e6d6abec0ef6c1354a0d43e9 Mon Sep 17 00:00:00 2001 From: Hoan Luu Huu <110280845+xquanluu@users.noreply.github.com> Date: Wed, 7 Aug 2024 18:24:58 +0700 Subject: [PATCH] Feat/deepgram tts onprem (#338) * support deepgram onpremise * wip * update speech utils version * install docker in ci --- .github/workflows/ci.yml | 5 ++ lib/routes/api/speech-credentials.js | 7 ++- lib/utils/speech-utils.js | 20 +++++--- package-lock.json | 68 ++++++++++++++++++++++++++-- package.json | 2 +- test/speech-credentials.js | 8 +++- 6 files changed, 93 insertions(+), 17 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9a06896..1abbc3e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -7,6 +7,11 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 + - name: Install Docker Compose + run: | + sudo curl -L "https://github.com/docker/compose/releases/download/1.29.2/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose + sudo chmod +x /usr/local/bin/docker-compose + docker-compose --version - uses: actions/setup-node@v3 with: node-version: lts/* diff --git a/lib/routes/api/speech-credentials.js b/lib/routes/api/speech-credentials.js index a8bdc6b..02dbb30 100644 --- a/lib/routes/api/speech-credentials.js +++ b/lib/routes/api/speech-credentials.js @@ -124,6 +124,7 @@ const encryptCredential = (obj) => { nuance_stt_uri, deepgram_stt_uri, deepgram_stt_use_tls, + deepgram_tts_uri, use_custom_tts, custom_tts_endpoint, custom_tts_endpoint_url, @@ -204,10 +205,10 @@ const encryptCredential = (obj) => { case 'deepgram': // API key is optional if onprem - if (!deepgram_stt_uri) { + if (!deepgram_stt_uri || !deepgram_tts_uri) { assert(api_key, 'invalid deepgram speech credential: api_key is required'); } - const deepgramData = JSON.stringify({api_key, deepgram_stt_uri, deepgram_stt_use_tls}); + const deepgramData = JSON.stringify({api_key, deepgram_stt_uri, deepgram_stt_use_tls, deepgram_tts_uri}); return encrypt(deepgramData); case 'ibm': @@ -458,6 +459,7 @@ router.put('/:sid', async(req, res) => { options, deepgram_stt_uri, deepgram_stt_use_tls, + deepgram_tts_uri, engine_version } = req.body; @@ -485,6 +487,7 @@ router.put('/:sid', async(req, res) => { options, deepgram_stt_uri, deepgram_stt_use_tls, + deepgram_tts_uri, engine_version }; logger.info({o, newCred}, 'updating speech credential with this new credential'); diff --git a/lib/utils/speech-utils.js b/lib/utils/speech-utils.js index 096ed25..97352e2 100644 --- a/lib/utils/speech-utils.js +++ b/lib/utils/speech-utils.js @@ -92,8 +92,8 @@ const testGoogleStt = async(logger, credentials) => { }; const testDeepgramStt = async(logger, credentials) => { - const {api_key} = credentials; - const deepgram = new Deepgram(api_key); + const {api_key, deepgram_stt_uri, deepgram_stt_use_tls} = credentials; + const deepgram = new Deepgram(api_key, deepgram_stt_uri, deepgram_stt_uri && deepgram_stt_use_tls); const mimetype = 'audio/wav'; const source = { @@ -272,7 +272,8 @@ const testPlayHT = async(logger, synthAudio, credentials) => { credentials, language: 'en-US', voice: 's3://voice-cloning-zero-shot/d9ff78ba-d016-47f6-b0ef-dd630f59414e/female-cs/manifest.json', - text: 'Hi there and welcome to jambones!' + text: 'Hi there and welcome to jambones!', + renderForCaching: true } ); // Test if playHT can fetch voices @@ -295,7 +296,8 @@ const testRimelabs = async(logger, synthAudio, credentials) => { credentials, language: 'en-US', voice: 'amber', - text: 'Hi there and welcome to jambones!' + text: 'Hi there and welcome to jambones!', + renderForCaching: true } ); } catch (err) { @@ -312,7 +314,8 @@ const testWhisper = async(logger, synthAudio, credentials) => { credentials, language: 'en-US', voice: 'alloy', - text: 'Hi there and welcome to jambones!' + text: 'Hi there and welcome to jambones!', + renderForCaching: true } ); } catch (err) { @@ -328,7 +331,8 @@ const testDeepgramTTS = async(logger, synthAudio, credentials) => { vendor: 'deepgram', credentials, model: 'aura-asteria-en', - text: 'Hi there and welcome to jambones!' + text: 'Hi there and welcome to jambones!', + renderForCaching: true } ); } catch (err) { @@ -383,7 +387,8 @@ const testVerbioTts = async(logger, synthAudio, credentials) => { credentials, language: 'en-US', voice: 'tommy_en-us', - text: 'Hi there and welcome to jambones!' + text: 'Hi there and welcome to jambones!', + renderForCaching: true } ); } catch (err) { @@ -509,6 +514,7 @@ function decryptCredential(obj, credential, logger, isObscureKey = true) { obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key; obj.deepgram_stt_uri = o.deepgram_stt_uri; obj.deepgram_stt_use_tls = o.deepgram_stt_use_tls; + obj.deepgram_tts_uri = o.deepgram_tts_uri; } else if ('ibm' === obj.vendor) { const o = JSON.parse(decrypt(credential)); diff --git a/package-lock.json b/package-lock.json index 6804e30..d309b97 100644 --- a/package-lock.json +++ b/package-lock.json @@ -19,7 +19,7 @@ "@jambonz/lamejs": "^1.2.2", "@jambonz/mw-registrar": "^0.2.7", "@jambonz/realtimedb-helpers": "^0.8.9", - "@jambonz/speech-utils": "^0.1.11", + "@jambonz/speech-utils": "^0.1.13", "@jambonz/time-series": "^0.2.8", "@jambonz/verb-specifications": "^0.0.72", "@soniox/soniox-node": "^1.2.2", @@ -2027,9 +2027,10 @@ } }, "node_modules/@jambonz/speech-utils": { - "version": "0.1.11", - "resolved": "https://registry.npmjs.org/@jambonz/speech-utils/-/speech-utils-0.1.11.tgz", - "integrity": "sha512-VgljBLUF871adib/3yWpzd7kv26ioxiLVkAIxm94CSk9WeZuzX1lVcE2SohojW3mjCYdYY6+B8FRyzlTD+en3g==", + "version": "0.1.13", + "resolved": "https://registry.npmjs.org/@jambonz/speech-utils/-/speech-utils-0.1.13.tgz", + "integrity": "sha512-QeVmNFLtJGPGQfmp7jXpy742AyJIv2EteelDmNTqWGFEwTBj88q8GLP51hUsIR2ZbE5n/ZmZb/ytT6Y6LIQSDg==", + "license": "MIT", "dependencies": { "@aws-sdk/client-polly": "^3.496.0", "@aws-sdk/client-sts": "^3.496.0", @@ -2041,7 +2042,7 @@ "form-urlencoded": "^6.1.4", "google-protobuf": "^3.21.2", "ibm-watson": "^8.0.0", - "microsoft-cognitiveservices-speech-sdk": "1.36.0", + "microsoft-cognitiveservices-speech-sdk": "1.38.0", "openai": "^4.25.0", "undici": "^6.4.0" } @@ -2051,6 +2052,28 @@ "resolved": "https://registry.npmjs.org/@types/node/-/node-13.13.52.tgz", "integrity": "sha512-s3nugnZumCC//n4moGGe6tkNMyYEdaDBitVjwPxXmR5lnMG5dHePinH2EdxkG3Rh1ghFHHixAG4NJhpJW1rthQ==" }, + "node_modules/@jambonz/speech-utils/node_modules/https-proxy-agent": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-4.0.0.tgz", + "integrity": "sha512-zoDhWrkR3of1l9QAL8/scJZyLu8j/gBkcwcaQOZh7Gyh/+uJQzGVETdgT30akuwkpL8HTRfssqI3BZuV18teDg==", + "license": "MIT", + "dependencies": { + "agent-base": "5", + "debug": "4" + }, + "engines": { + "node": ">= 6.0.0" + } + }, + "node_modules/@jambonz/speech-utils/node_modules/https-proxy-agent/node_modules/agent-base": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-5.1.1.tgz", + "integrity": "sha512-TMeqbNl2fMW0nMjTEPOwe3J/PRFP4vqeoNuQMG0HlMrtm5QxKqdvAkZ1pRBQ/ulIyDD5Yq0nJ7YbdD8ey0TO3g==", + "license": "MIT", + "engines": { + "node": ">= 6.0.0" + } + }, "node_modules/@jambonz/speech-utils/node_modules/ibm-watson": { "version": "8.0.0", "resolved": "https://registry.npmjs.org/ibm-watson/-/ibm-watson-8.0.0.tgz", @@ -2072,6 +2095,41 @@ "node": ">=16.0.0" } }, + "node_modules/@jambonz/speech-utils/node_modules/microsoft-cognitiveservices-speech-sdk": { + "version": "1.38.0", + "resolved": "https://registry.npmjs.org/microsoft-cognitiveservices-speech-sdk/-/microsoft-cognitiveservices-speech-sdk-1.38.0.tgz", + "integrity": "sha512-NA6J4eIDkeR9iN83rcn77Kn5AWQcizDEn1tLMjzRvSovUNB1FrZe0mWYO0fsGltUwMl3Ns5OZ3lGw42PU4fEYA==", + "license": "MIT", + "dependencies": { + "@types/webrtc": "^0.0.37", + "agent-base": "^6.0.1", + "bent": "^7.3.12", + "https-proxy-agent": "^4.0.0", + "uuid": "^9.0.0", + "ws": "^7.5.6" + } + }, + "node_modules/@jambonz/speech-utils/node_modules/ws": { + "version": "7.5.10", + "resolved": "https://registry.npmjs.org/ws/-/ws-7.5.10.tgz", + "integrity": "sha512-+dbF1tHwZpXcbOJdVOkzLDxZP1ailvSxM6ZweXTegylPny803bFhA+vqBYw4s31NSAk4S2Qz+AKXK9a4wkdjcQ==", + "license": "MIT", + "engines": { + "node": ">=8.3.0" + }, + "peerDependencies": { + "bufferutil": "^4.0.1", + "utf-8-validate": "^5.0.2" + }, + "peerDependenciesMeta": { + "bufferutil": { + "optional": true + }, + "utf-8-validate": { + "optional": true + } + } + }, "node_modules/@jambonz/time-series": { "version": "0.2.8", "resolved": "https://registry.npmjs.org/@jambonz/time-series/-/time-series-0.2.8.tgz", diff --git a/package.json b/package.json index 8db0e63..d5edc8e 100644 --- a/package.json +++ b/package.json @@ -29,7 +29,7 @@ "@jambonz/lamejs": "^1.2.2", "@jambonz/mw-registrar": "^0.2.7", "@jambonz/realtimedb-helpers": "^0.8.9", - "@jambonz/speech-utils": "^0.1.11", + "@jambonz/speech-utils": "^0.1.13", "@jambonz/time-series": "^0.2.8", "@jambonz/verb-specifications": "^0.0.72", "@soniox/soniox-node": "^1.2.2", diff --git a/test/speech-credentials.js b/test/speech-credentials.js index f650212..c5a0752 100644 --- a/test/speech-credentials.js +++ b/test/speech-credentials.js @@ -371,7 +371,8 @@ test('speech credentials tests', async(t) => { vendor: 'deepgram', use_for_stt: true, deepgram_stt_uri: "127.0.0.1:50002", - deepgram_stt_use_tls: true + deepgram_stt_use_tls: true, + deepgram_tts_uri: 'https://server.com' } }); t.ok(result.statusCode === 201, 'successfully added speech credential for deepgram'); @@ -386,6 +387,7 @@ test('speech credentials tests', async(t) => { t.ok(result.statusCode === 200, 'successfully get speech credential for deepgram'); t.ok(result.body.deepgram_stt_uri === '127.0.0.1:50002', "deepgram_stt_uri is correct for deepgram"); t.ok(result.body.deepgram_stt_use_tls === true, "deepgram_stt_use_tls is correct for deepgram"); + t.ok(result.body.deepgram_tts_uri === 'https://server.com', "deepgram_tts_uri is correct for deepgram") result = await request.put(`/Accounts/${account_sid}/SpeechCredentials/${dg_sid}`, { resolveWithFullResponse: true, @@ -395,7 +397,8 @@ test('speech credentials tests', async(t) => { vendor: 'deepgram', use_for_stt: true, deepgram_stt_uri: "127.0.0.2:50002", - deepgram_stt_use_tls: false + deepgram_stt_use_tls: false, + deepgram_tts_uri: 'https://server2.com' } }); t.ok(result.statusCode === 204, 'successfully updated speech credential for deepgram onprem'); @@ -409,6 +412,7 @@ test('speech credentials tests', async(t) => { t.ok(result.statusCode === 200, 'successfully get speech credential for deepgram onprem'); t.ok(result.body.deepgram_stt_uri === '127.0.0.2:50002', "deepgram_stt_uri is correct for deepgram onprem"); t.ok(result.body.deepgram_stt_use_tls === false, "deepgram_stt_use_tls is correct for deepgram onprem"); + t.ok(result.body.deepgram_tts_uri === 'https://server2.com', "deepgram_tts_uri is correct for deepgram onprem"); result = await request.delete(`/Accounts/${account_sid}/SpeechCredentials/${dg_sid}`, { auth: authUser,