From 2d2b98dab585b175e6d6abec0ef6c1354a0d43e9 Mon Sep 17 00:00:00 2001
From: Hoan Luu Huu <110280845+xquanluu@users.noreply.github.com>
Date: Wed, 7 Aug 2024 18:24:58 +0700
Subject: [PATCH] Feat/deepgram tts onprem (#338)

* support deepgram on-premises

* wip

* update speech utils version

* install docker in ci
---
 .github/workflows/ci.yml             |  5 ++
 lib/routes/api/speech-credentials.js |  7 ++-
 lib/utils/speech-utils.js            | 20 +++++---
 package-lock.json                    | 68 ++++++++++++++++++++++++++--
 package.json                         |  2 +-
 test/speech-credentials.js           |  8 +++-
 6 files changed, 93 insertions(+), 17 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 9a06896..1abbc3e 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -7,6 +7,11 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v3
+      - name: Install Docker Compose
+        run: |
+          sudo curl -L "https://github.com/docker/compose/releases/download/1.29.2/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
+          sudo chmod +x /usr/local/bin/docker-compose
+          docker-compose --version
       - uses: actions/setup-node@v3
         with:
           node-version: lts/*
diff --git a/lib/routes/api/speech-credentials.js b/lib/routes/api/speech-credentials.js
index a8bdc6b..02dbb30 100644
--- a/lib/routes/api/speech-credentials.js
+++ b/lib/routes/api/speech-credentials.js
@@ -124,6 +124,7 @@ const encryptCredential = (obj) => {
     nuance_stt_uri,
     deepgram_stt_uri,
     deepgram_stt_use_tls,
+    deepgram_tts_uri,
     use_custom_tts,
     custom_tts_endpoint,
     custom_tts_endpoint_url,
@@ -204,10 +205,10 @@ const encryptCredential = (obj) => {
 
     case 'deepgram':
       // API key is optional if onprem
-      if (!deepgram_stt_uri) {
+      if (!deepgram_stt_uri || !deepgram_tts_uri) {
         assert(api_key, 'invalid deepgram speech credential: api_key is required');
       }
-      const deepgramData = JSON.stringify({api_key, deepgram_stt_uri, deepgram_stt_use_tls});
+      const deepgramData = JSON.stringify({api_key, deepgram_stt_uri, deepgram_stt_use_tls, deepgram_tts_uri});
       return encrypt(deepgramData);
 
     case 'ibm':
@@ -458,6 +459,7 @@ router.put('/:sid', async(req, res) => {
           options,
           deepgram_stt_uri,
           deepgram_stt_use_tls,
+          deepgram_tts_uri,
           engine_version
         } = req.body;
 
@@ -485,6 +487,7 @@ router.put('/:sid', async(req, res) => {
           options,
           deepgram_stt_uri,
           deepgram_stt_use_tls,
+          deepgram_tts_uri,
           engine_version
         };
         logger.info({o, newCred}, 'updating speech credential with this new credential');
diff --git a/lib/utils/speech-utils.js b/lib/utils/speech-utils.js
index 096ed25..97352e2 100644
--- a/lib/utils/speech-utils.js
+++ b/lib/utils/speech-utils.js
@@ -92,8 +92,8 @@ const testGoogleStt = async(logger, credentials) => {
 };
 
 const testDeepgramStt = async(logger, credentials) => {
-  const {api_key} = credentials;
-  const deepgram = new Deepgram(api_key);
+  const {api_key, deepgram_stt_uri, deepgram_stt_use_tls} = credentials;
+  const deepgram = new Deepgram(api_key, deepgram_stt_uri, deepgram_stt_uri && deepgram_stt_use_tls);
 
   const mimetype = 'audio/wav';
   const source = {
@@ -272,7 +272,8 @@ const testPlayHT = async(logger, synthAudio, credentials) => {
         credentials,
         language: 'en-US',
         voice: 's3://voice-cloning-zero-shot/d9ff78ba-d016-47f6-b0ef-dd630f59414e/female-cs/manifest.json',
-        text: 'Hi there and welcome to jambones!'
+        text: 'Hi there and welcome to jambones!',
+        renderForCaching: true
       }
     );
     // Test if playHT can fetch voices
@@ -295,7 +296,8 @@ const testRimelabs = async(logger, synthAudio, credentials) => {
         credentials,
         language: 'en-US',
         voice: 'amber',
-        text: 'Hi there and welcome to jambones!'
+        text: 'Hi there and welcome to jambones!',
+        renderForCaching: true
       }
     );
   } catch (err) {
@@ -312,7 +314,8 @@ const testWhisper = async(logger, synthAudio, credentials) => {
         credentials,
         language: 'en-US',
         voice: 'alloy',
-        text: 'Hi there and welcome to jambones!'
+        text: 'Hi there and welcome to jambones!',
+        renderForCaching: true
       }
     );
   } catch (err) {
@@ -328,7 +331,8 @@ const testDeepgramTTS = async(logger, synthAudio, credentials) => {
         vendor: 'deepgram',
         credentials,
         model: 'aura-asteria-en',
-        text: 'Hi there and welcome to jambones!'
+        text: 'Hi there and welcome to jambones!',
+        renderForCaching: true
       }
     );
   } catch (err) {
@@ -383,7 +387,8 @@ const testVerbioTts = async(logger, synthAudio, credentials) => {
         credentials,
         language: 'en-US',
         voice: 'tommy_en-us',
-        text: 'Hi there and welcome to jambones!'
+        text: 'Hi there and welcome to jambones!',
+        renderForCaching: true
       }
     );
   } catch (err) {
@@ -509,6 +514,7 @@ function decryptCredential(obj, credential, logger, isObscureKey = true) {
     obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
     obj.deepgram_stt_uri = o.deepgram_stt_uri;
     obj.deepgram_stt_use_tls = o.deepgram_stt_use_tls;
+    obj.deepgram_tts_uri = o.deepgram_tts_uri;
   }
   else if ('ibm' === obj.vendor) {
     const o = JSON.parse(decrypt(credential));
diff --git a/package-lock.json b/package-lock.json
index 6804e30..d309b97 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -19,7 +19,7 @@
         "@jambonz/lamejs": "^1.2.2",
         "@jambonz/mw-registrar": "^0.2.7",
         "@jambonz/realtimedb-helpers": "^0.8.9",
-        "@jambonz/speech-utils": "^0.1.11",
+        "@jambonz/speech-utils": "^0.1.13",
         "@jambonz/time-series": "^0.2.8",
         "@jambonz/verb-specifications": "^0.0.72",
         "@soniox/soniox-node": "^1.2.2",
@@ -2027,9 +2027,10 @@
       }
     },
     "node_modules/@jambonz/speech-utils": {
-      "version": "0.1.11",
-      "resolved": "https://registry.npmjs.org/@jambonz/speech-utils/-/speech-utils-0.1.11.tgz",
-      "integrity": "sha512-VgljBLUF871adib/3yWpzd7kv26ioxiLVkAIxm94CSk9WeZuzX1lVcE2SohojW3mjCYdYY6+B8FRyzlTD+en3g==",
+      "version": "0.1.13",
+      "resolved": "https://registry.npmjs.org/@jambonz/speech-utils/-/speech-utils-0.1.13.tgz",
+      "integrity": "sha512-QeVmNFLtJGPGQfmp7jXpy742AyJIv2EteelDmNTqWGFEwTBj88q8GLP51hUsIR2ZbE5n/ZmZb/ytT6Y6LIQSDg==",
+      "license": "MIT",
       "dependencies": {
         "@aws-sdk/client-polly": "^3.496.0",
         "@aws-sdk/client-sts": "^3.496.0",
@@ -2041,7 +2042,7 @@
         "form-urlencoded": "^6.1.4",
         "google-protobuf": "^3.21.2",
         "ibm-watson": "^8.0.0",
-        "microsoft-cognitiveservices-speech-sdk": "1.36.0",
+        "microsoft-cognitiveservices-speech-sdk": "1.38.0",
         "openai": "^4.25.0",
         "undici": "^6.4.0"
       }
@@ -2051,6 +2052,28 @@
       "resolved": "https://registry.npmjs.org/@types/node/-/node-13.13.52.tgz",
       "integrity": "sha512-s3nugnZumCC//n4moGGe6tkNMyYEdaDBitVjwPxXmR5lnMG5dHePinH2EdxkG3Rh1ghFHHixAG4NJhpJW1rthQ=="
     },
+    "node_modules/@jambonz/speech-utils/node_modules/https-proxy-agent": {
+      "version": "4.0.0",
+      "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-4.0.0.tgz",
+      "integrity": "sha512-zoDhWrkR3of1l9QAL8/scJZyLu8j/gBkcwcaQOZh7Gyh/+uJQzGVETdgT30akuwkpL8HTRfssqI3BZuV18teDg==",
+      "license": "MIT",
+      "dependencies": {
+        "agent-base": "5",
+        "debug": "4"
+      },
+      "engines": {
+        "node": ">= 6.0.0"
+      }
+    },
+    "node_modules/@jambonz/speech-utils/node_modules/https-proxy-agent/node_modules/agent-base": {
+      "version": "5.1.1",
+      "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-5.1.1.tgz",
+      "integrity": "sha512-TMeqbNl2fMW0nMjTEPOwe3J/PRFP4vqeoNuQMG0HlMrtm5QxKqdvAkZ1pRBQ/ulIyDD5Yq0nJ7YbdD8ey0TO3g==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 6.0.0"
+      }
+    },
     "node_modules/@jambonz/speech-utils/node_modules/ibm-watson": {
       "version": "8.0.0",
       "resolved": "https://registry.npmjs.org/ibm-watson/-/ibm-watson-8.0.0.tgz",
@@ -2072,6 +2095,41 @@
         "node": ">=16.0.0"
       }
     },
+    "node_modules/@jambonz/speech-utils/node_modules/microsoft-cognitiveservices-speech-sdk": {
+      "version": "1.38.0",
+      "resolved": "https://registry.npmjs.org/microsoft-cognitiveservices-speech-sdk/-/microsoft-cognitiveservices-speech-sdk-1.38.0.tgz",
+      "integrity": "sha512-NA6J4eIDkeR9iN83rcn77Kn5AWQcizDEn1tLMjzRvSovUNB1FrZe0mWYO0fsGltUwMl3Ns5OZ3lGw42PU4fEYA==",
+      "license": "MIT",
+      "dependencies": {
+        "@types/webrtc": "^0.0.37",
+        "agent-base": "^6.0.1",
+        "bent": "^7.3.12",
+        "https-proxy-agent": "^4.0.0",
+        "uuid": "^9.0.0",
+        "ws": "^7.5.6"
+      }
+    },
+    "node_modules/@jambonz/speech-utils/node_modules/ws": {
+      "version": "7.5.10",
+      "resolved": "https://registry.npmjs.org/ws/-/ws-7.5.10.tgz",
+      "integrity": "sha512-+dbF1tHwZpXcbOJdVOkzLDxZP1ailvSxM6ZweXTegylPny803bFhA+vqBYw4s31NSAk4S2Qz+AKXK9a4wkdjcQ==",
+      "license": "MIT",
+      "engines": {
+        "node": ">=8.3.0"
+      },
+      "peerDependencies": {
+        "bufferutil": "^4.0.1",
+        "utf-8-validate": "^5.0.2"
+      },
+      "peerDependenciesMeta": {
+        "bufferutil": {
+          "optional": true
+        },
+        "utf-8-validate": {
+          "optional": true
+        }
+      }
+    },
     "node_modules/@jambonz/time-series": {
       "version": "0.2.8",
       "resolved": "https://registry.npmjs.org/@jambonz/time-series/-/time-series-0.2.8.tgz",
diff --git a/package.json b/package.json
index 8db0e63..d5edc8e 100644
--- a/package.json
+++ b/package.json
@@ -29,7 +29,7 @@
     "@jambonz/lamejs": "^1.2.2",
     "@jambonz/mw-registrar": "^0.2.7",
     "@jambonz/realtimedb-helpers": "^0.8.9",
-    "@jambonz/speech-utils": "^0.1.11",
+    "@jambonz/speech-utils": "^0.1.13",
     "@jambonz/time-series": "^0.2.8",
     "@jambonz/verb-specifications": "^0.0.72",
     "@soniox/soniox-node": "^1.2.2",
diff --git a/test/speech-credentials.js b/test/speech-credentials.js
index f650212..c5a0752 100644
--- a/test/speech-credentials.js
+++ b/test/speech-credentials.js
@@ -371,7 +371,8 @@ test('speech credentials tests', async(t) => {
         vendor: 'deepgram',
         use_for_stt: true,
         deepgram_stt_uri: "127.0.0.1:50002",
-        deepgram_stt_use_tls: true
+        deepgram_stt_use_tls: true,
+        deepgram_tts_uri: 'https://server.com'
       }
     });
     t.ok(result.statusCode === 201, 'successfully added speech credential for deepgram');
@@ -386,6 +387,7 @@ test('speech credentials tests', async(t) => {
     t.ok(result.statusCode === 200, 'successfully get speech credential for deepgram');
     t.ok(result.body.deepgram_stt_uri === '127.0.0.1:50002', "deepgram_stt_uri is correct for deepgram");
     t.ok(result.body.deepgram_stt_use_tls === true, "deepgram_stt_use_tls is correct for deepgram");
+    t.ok(result.body.deepgram_tts_uri === 'https://server.com', "deepgram_tts_uri is correct for deepgram")
 
     result = await request.put(`/Accounts/${account_sid}/SpeechCredentials/${dg_sid}`, {
       resolveWithFullResponse: true,
@@ -395,7 +397,8 @@ test('speech credentials tests', async(t) => {
         vendor: 'deepgram',
         use_for_stt: true,
         deepgram_stt_uri: "127.0.0.2:50002",
-        deepgram_stt_use_tls: false
+        deepgram_stt_use_tls: false,
+        deepgram_tts_uri: 'https://server2.com'
       }
     });
     t.ok(result.statusCode === 204, 'successfully updated speech credential for deepgram onprem');
@@ -409,6 +412,7 @@ test('speech credentials tests', async(t) => {
     t.ok(result.statusCode === 200, 'successfully get speech credential for deepgram onprem');
     t.ok(result.body.deepgram_stt_uri === '127.0.0.2:50002', "deepgram_stt_uri is correct for deepgram onprem");
     t.ok(result.body.deepgram_stt_use_tls === false, "deepgram_stt_use_tls is correct for deepgram onprem");
+    t.ok(result.body.deepgram_tts_uri === 'https://server2.com', "deepgram_tts_uri is correct for deepgram onprem");
 
     result = await request.delete(`/Accounts/${account_sid}/SpeechCredentials/${dg_sid}`, {
       auth: authUser,