support stt speechmatics (#353)

* support stt speechmatics
* support speechmatics region authentication
* update test case for speechmatics_stt_uri
2026-01-25 02:08:24 +00:00 · 2024-10-11 20:17:40 +07:00
parent 77b9ca4cba
commit 6e779f6744
5 changed files with 156 additions and 2 deletions
--- a/lib/routes/api/speech-credentials.js
+++ b/lib/routes/api/speech-credentials.js
@@ -10,7 +10,8 @@ const {decryptCredential, testWhisper, testDeepgramTTS,
  testPlayHT,
  testRimelabs,
  testVerbioTts,
-  testVerbioStt} = require('../../utils/speech-utils');
+  testVerbioStt,
+  testSpeechmaticsStt} = require('../../utils/speech-utils');
 const {DbErrorUnprocessableRequest, DbErrorForbidden, DbErrorBadRequest} = require('../../utils/errors');
 const {
  testGoogleTts,
@@ -122,6 +123,7 @@ const encryptCredential = (obj) => {
    secret,
    nuance_tts_uri,
    nuance_stt_uri,
+    speechmatics_stt_uri,
    deepgram_stt_uri,
    deepgram_stt_use_tls,
    deepgram_tts_uri,
@@ -236,6 +238,12 @@ const encryptCredential = (obj) => {
      const elevenlabsData = JSON.stringify({api_key, model_id, options});
      return encrypt(elevenlabsData);

+    case 'speechmatics':
+      assert(api_key, 'invalid speechmatics speech credential: api_key is required');
+      assert(speechmatics_stt_uri, 'invalid speechmatics speech credential: speechmatics_stt_uri is required');
+      const speechmaticsData = JSON.stringify({api_key, speechmatics_stt_uri, options});
+      return encrypt(speechmaticsData);
+
    case 'playht':
      assert(api_key, 'invalid playht speech credential: api_key is required');
      assert(user_id, 'invalid playht speech credential: user_id is required');
@@ -768,6 +776,18 @@ router.get('/:sid/test', async(req, res) => {
          SpeechCredential.ttsTestResult(sid, false);
        }
      }
+    } else if (cred.vendor === 'speechmatics') {
+      const {api_key} = credential;
+      if (cred.use_for_stt) {
+        try {
+          await testSpeechmaticsStt(logger, {api_key});
+          results.stt.status = 'ok';
+          SpeechCredential.ttsTestResult(sid, true);
+        } catch (err) {
+          results.stt = {status: 'fail', reason: err.message};
+          SpeechCredential.ttsTestResult(sid, false);
+        }
+      }
    } else if (cred.vendor === 'playht') {
      if (cred.use_for_tts) {
        try {
--- a/lib/utils/speech-utils.js
+++ b/lib/utils/speech-utils.js
@@ -7,6 +7,7 @@ const bent = require('bent');
 const fs = require('fs');
 const { AssemblyAI } = require('assemblyai');
 const {decrypt, obscureKey} = require('./encrypt-decrypt');
+const { RealtimeSession } = require('speechmatics');

 const TtsGoogleLanguagesVoices = require('./speech-data/tts-google');
 const TtsAwsLanguagesVoices = require('./speech-data/tts-aws');
@@ -54,6 +55,61 @@ const testSonioxStt = async(logger, credentials) => {
  });
 };

+/**
+ * Validate a speechmatics STT credential: stream data/test_audio.wav through a realtime
+ * session and resolve with the concatenated transcript once EndOfTranscript is received.
+ * @param {object} logger - used via logger.info() to report failures
+ * @param {object} credentials - {api_key, speechmatics_stt_uri}
+ * @returns {Promise<string>} rejects on a session 'Error' event, a start/stop failure or an audio read error
+ */
+const testSpeechmaticsStt = async(logger, credentials) => {
+  const {api_key, speechmatics_stt_uri} = credentials;
+  // plain executor (not async): every failure path below reaches reject() explicitly
+  return new Promise((resolve, reject) => {
+    try {
+      const session = new RealtimeSession({ apiKey: api_key, realtimeUrl: speechmatics_stt_uri });
+      let transcription = '';
+      // stop() is wrapped so that a sync throw or an async rejection both land in reject()
+      const stopSession = () => Promise.resolve().then(() => session.stop()).catch(reject);
+
+      session.addListener('Error', (error) => reject(error));
+      session.addListener('AddTranscript', (message) => {
+        transcription += message.metadata.transcript;
+      });
+      session.addListener('EndOfTranscript', () => resolve(transcription));
+
+      session
+        .start({
+          transcription_config: {
+            language: 'en',
+            operating_point: 'enhanced',
+            enable_partials: true,
+            max_delay: 2,
+          },
+          audio_format: { type: 'file' },
+        })
+        .then(() => {
+          const fileStream = fs.createReadStream(`${__dirname}/../../data/test_audio.wav`);
+          fileStream.on('data', (sample) => session.sendAudio(sample));
+          // whole file sent: stop the session; the EndOfTranscript listener resolves the promise
+          fileStream.on('end', () => stopSession());
+          // a read stream 'error' with no listener is thrown as an uncaught exception
+          fileStream.on('error', (error) => {
+            logger.info({error}, 'failed to read speechmatics test audio');
+            reject(error);
+            stopSession();
+          });
+          return;
+        })
+        .catch((error) => reject(error));
+    } catch (error) {
+      logger.info({error}, 'failed to get speechmatics transcript');
+      reject(error);
+    }
+  });
+};
+
+
 const testNuanceTts = async(logger, getTtsVoices, credentials) => {
  const voices = await getTtsVoices({vendor: 'nuance', credentials});
  return voices;
@@ -532,6 +588,10 @@ function decryptCredential(obj, credential, logger, isObscureKey = true) {
  } else if ('soniox' === obj.vendor) {
    const o = JSON.parse(decrypt(credential));
    obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
+  } else if ('speechmatics' === obj.vendor) {
+    const o = JSON.parse(decrypt(credential));
+    obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
+    obj.speechmatics_stt_uri = o.speechmatics_stt_uri;
  } else if ('elevenlabs' === obj.vendor) {
    const o = JSON.parse(decrypt(credential));
    obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
@@ -1066,5 +1126,6 @@ module.exports = {
  testWhisper,
  testVerbioTts,
  testVerbioStt,
-  getLanguagesAndVoicesForVendor
+  getLanguagesAndVoicesForVendor,
+  testSpeechmaticsStt
 };