support openai stt (#402)

* support openai stt * wip * wip * add stt languages for openai
2025-12-19 05:47:46 +00:00 · 2025-03-28 21:14:50 +07:00
parent 32a2bfcdb5
commit 1c55bad04f
5 changed files with 149 additions and 7 deletions
--- a/lib/routes/api/speech-credentials.js
+++ b/lib/routes/api/speech-credentials.js
@@ -13,7 +13,8 @@ const {decryptCredential, testWhisper, testDeepgramTTS,
  testVerbioStt,
  testSpeechmaticsStt,
  testCartesia,
-  testVoxistStt} = require('../../utils/speech-utils');
+  testVoxistStt,
+  testOpenAiStt} = require('../../utils/speech-utils');
 const {DbErrorUnprocessableRequest, DbErrorForbidden, DbErrorBadRequest} = require('../../utils/errors');
 const {
  testGoogleTts,
@@ -282,6 +283,12 @@ const encryptCredential = (obj) => {
      const whisperData = JSON.stringify({api_key, model_id});
      return encrypt(whisperData);

+    case 'openai':
+      assert(api_key, 'invalid openai speech credential: api_key is required');
+      assert(model_id, 'invalid openai speech credential: model_id is required');
+      const openaiData = JSON.stringify({api_key, model_id});
+      return encrypt(openaiData);
+
    case 'verbio':
      assert(engine_version, 'invalid verbio speech credential: client_id is required');
      assert(client_id, 'invalid verbio speech credential: client_id is required');
@@ -882,6 +889,17 @@ router.get('/:sid/test', async(req, res) => {
          SpeechCredential.ttsTestResult(sid, false);
        }
      }
+    } else if (cred.vendor === 'openai') {
+      if (cred.use_for_stt) {
+        try {
+          await testOpenAiStt(logger, credential);
+          results.stt.status = 'ok';
+          SpeechCredential.sttTestResult(sid, true);
+        } catch (err) {
+          results.stt = {status: 'fail', reason: err.message};
+          SpeechCredential.sttTestResult(sid, false);
+        }
+      }
    } else if (cred.vendor === 'verbio') {
      if (cred.use_for_tts) {
        try {
--- a/lib/utils/speech-data/stt-model-openai.js
+++ b/lib/utils/speech-data/stt-model-openai.js
@@ -0,0 +1,6 @@
+// Speech-to-text models listed for the OpenAI vendor: `name` is the display label and
+// `value` is the model identifier.
+module.exports = [
+  { name: 'Whisper', value: 'whisper-1' },
+  { name: 'GPT 4o Mini Transcribe', value: 'gpt-4o-mini-transcribe' },
+  { name: 'GPT 4o Transcribe', value: 'gpt-4o-transcribe' },
+];
+
--- a/lib/utils/speech-data/stt-openai.js
+++ b/lib/utils/speech-data/stt-openai.js
@@ -0,0 +1,59 @@
+// Languages listed for OpenAI speech-to-text. Each entry pairs a display name with a short
+// language code (the values look like ISO 639-1 codes -- TODO confirm against OpenAI's docs).
+module.exports = [
+  { name: 'Afrikaans', value: 'af' },
+  { name: 'Arabic', value: 'ar' },
+  { name: 'Azerbaijani', value: 'az' },
+  { name: 'Belarusian', value: 'be' },
+  { name: 'Bulgarian', value: 'bg' },
+  { name: 'Bosnian', value: 'bs' },
+  { name: 'Catalan', value: 'ca' },
+  { name: 'Czech', value: 'cs' },
+  { name: 'Welsh', value: 'cy' },
+  { name: 'Danish', value: 'da' },
+  { name: 'German', value: 'de' },
+  { name: 'Greek', value: 'el' },
+  { name: 'English', value: 'en' },
+  { name: 'Spanish', value: 'es' },
+  { name: 'Estonian', value: 'et' },
+  { name: 'Persian', value: 'fa' },
+  { name: 'Finnish', value: 'fi' },
+  { name: 'French', value: 'fr' },
+  { name: 'Galician', value: 'gl' },
+  { name: 'Hebrew', value: 'he' },
+  { name: 'Hindi', value: 'hi' },
+  { name: 'Croatian', value: 'hr' },
+  { name: 'Hungarian', value: 'hu' },
+  { name: 'Armenian', value: 'hy' },
+  { name: 'Indonesian', value: 'id' },
+  { name: 'Icelandic', value: 'is' },
+  { name: 'Italian', value: 'it' },
+  { name: 'Japanese', value: 'ja' },
+  { name: 'Kazakh', value: 'kk' },
+  { name: 'Kannada', value: 'kn' },
+  { name: 'Korean', value: 'ko' },
+  { name: 'Lithuanian', value: 'lt' },
+  { name: 'Latvian', value: 'lv' },
+  { name: 'Maori', value: 'mi' },
+  { name: 'Macedonian', value: 'mk' },
+  { name: 'Marathi', value: 'mr' },
+  { name: 'Malay', value: 'ms' },
+  { name: 'Nepali', value: 'ne' },
+  { name: 'Dutch', value: 'nl' },
+  { name: 'Norwegian', value: 'no' },
+  { name: 'Polish', value: 'pl' },
+  { name: 'Portuguese', value: 'pt' },
+  { name: 'Romanian', value: 'ro' },
+  { name: 'Russian', value: 'ru' },
+  { name: 'Slovak', value: 'sk' },
+  { name: 'Slovenian', value: 'sl' },
+  { name: 'Serbian', value: 'sr' },
+  { name: 'Swedish', value: 'sv' },
+  { name: 'Swahili', value: 'sw' },
+  { name: 'Tamil', value: 'ta' },
+  { name: 'Thai', value: 'th' },
+  { name: 'Tagalog', value: 'tl' },
+  { name: 'Turkish', value: 'tr' },
+  { name: 'Ukrainian', value: 'uk' },
+  { name: 'Urdu', value: 'ur' },
+  { name: 'Vietnamese', value: 'vi' },
+  { name: 'Chinese', value: 'zh' },
+];
--- a/lib/utils/speech-data/tts-model-openai.js
+++ b/lib/utils/speech-data/tts-model-openai.js
@@ -0,0 +1,6 @@
+// Text-to-speech models listed for the OpenAI vendor: `name` is the display label and
+// `value` is the model identifier.
+module.exports = [
+  { name: 'TTS-1', value: 'tts-1' },
+  { name: 'TTS-1-HD', value: 'tts-1-hd' },
+  { name: 'GPT-4o-Mini-TTS', value: 'gpt-4o-mini-tts' },
+];
+
--- a/lib/utils/speech-utils.js
+++ b/lib/utils/speech-utils.js
@@ -20,6 +20,7 @@ const TtsElevenlabsLanguagesVoices = require('./speech-data/tts-elevenlabs');
 const TtsWhisperLanguagesVoices = require('./speech-data/tts-whisper');
 const TtsPlayHtLanguagesVoices = require('./speech-data/tts-playht');
 const TtsVerbioLanguagesVoices = require('./speech-data/tts-verbio');
+const ttsCartesia = require('./speech-data/tts-cartesia');

 const TtsModelDeepgram = require('./speech-data/tts-model-deepgram');
 const TtsLanguagesDeepgram = require('./speech-data/tts-deepgram');
@@ -29,6 +30,7 @@ const TtsModelPlayHT = require('./speech-data/tts-model-playht');
 const ttsLanguagesPlayHt = require('./speech-data/tts-languages-playht');
 const TtsModelRimelabs = require('./speech-data/tts-model-rimelabs');
 const TtsModelCartesia = require('./speech-data/tts-model-cartesia');
+const TtsModelOpenai = require('./speech-data/tts-model-openai');

 const SttGoogleLanguagesVoices = require('./speech-data/stt-google');
 const SttAwsLanguagesVoices = require('./speech-data/stt-aws');
@@ -43,8 +45,10 @@ const SttSpeechmaticsLanguagesVoices = require('./speech-data/stt-speechmatics')
 const SttAssemblyaiLanguagesVoices = require('./speech-data/stt-assemblyai');
 const SttVoxistLanguagesVoices = require('./speech-data/stt-voxist');
 const SttVerbioLanguagesVoices = require('./speech-data/stt-verbio');
-const ttsCartesia = require('./speech-data/tts-cartesia');
-const ttsModelCartesia = require('./speech-data/tts-model-cartesia');
+const SttOpenaiLanguagesVoices = require('./speech-data/stt-openai');
+
+
+const SttModelOpenai = require('./speech-data/stt-model-openai');


 const testSonioxStt = async(logger, credentials) => {
@@ -477,6 +481,43 @@ const testVerbioStt = async(logger, getVerbioAccessToken, credentials) => {
  }
 };

+/**
+ * Verify an OpenAI speech-to-text credential by posting the bundled test audio file to the
+ * transcription endpoint.
+ *
+ * @param {object} logger - logger exposing debug() and info()
+ * @param {object} credentials - decrypted credential; `api_key` is sent as the bearer token and
+ *   `model_id` selects the model when it is one of the entries in SttModelOpenai
+ *   (any other value falls back to 'whisper-1')
+ * @returns {Promise<object>} the JSON body returned by the transcription endpoint
+ * @throws {Error} when the audio file cannot be read, the request fails, or the API
+ *   responds with a non-2xx status
+ */
+const testOpenAiStt = async(logger, credentials) => {
+  const {api_key, model_id} = credentials;
+  try {
+    // Create a FormData object to properly format the multipart request
+    const formData = new FormData();
+
+    // Add the audio file as 'file' field
+    const audioBuffer = fs.readFileSync(`${__dirname}/../../data/test_audio.wav`);
+    const blob = new Blob([audioBuffer], { type: 'audio/wav' });
+    formData.append('file', blob, 'audio.wav');
+
+    // Exercise the model stored with the credential so the test reflects what will be used;
+    // anything that is not a known transcription model keeps the previous 'whisper-1' default.
+    const isKnownSttModel = SttModelOpenai.some((m) => m.value === model_id);
+    formData.append('model', isKnownSttModel ? model_id : 'whisper-1');
+
+    // Make the request using fetch
+    const response = await fetch('https://api.openai.com/v1/audio/transcriptions', {
+      method: 'POST',
+      headers: {
+        'Authorization': `Bearer ${api_key}`,
+        'User-Agent': 'jambonz'
+      },
+      body: formData
+    });
+
+    if (!response.ok) {
+      // The error body is not guaranteed to be JSON (e.g. a gateway error page); a parse
+      // failure must not hide the HTTP status from the caller.
+      const body = await response.json().catch(() => null);
+      const detail = body?.error?.message ?? response.statusText;
+      throw new Error(`OpenAI API error: ${response.status} ${detail}`);
+    }
+
+    const json = await response.json();
+    logger.debug({json}, 'successfully speech to text from OpenAI');
+    return json;
+  } catch (err) {
+    logger.info({err}, 'OpenAI speech-to-text request failed');
+    throw err;
+  }
+};
+
 const testAssemblyStt = async(logger, credentials) => {
  const {api_key} = credentials;

@@ -651,6 +692,10 @@ function decryptCredential(obj, credential, logger, isObscureKey = true) {
    const o = JSON.parse(decrypt(credential));
    obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
    obj.model_id = o.model_id;
+  } else if ('openai' === obj.vendor) {
+    const o = JSON.parse(decrypt(credential));
+    obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
+    obj.model_id = o.model_id;
  } else if ('verbio' === obj.vendor) {
    const o = JSON.parse(decrypt(credential));
    obj.client_id = o.client_id;
@@ -714,6 +759,8 @@ async function getLanguagesAndVoicesForVendor(logger, vendor, credential, getTts
      return await getLanguagesVoicesForVoxist(credential, getTtsVoices, logger);
    case 'whisper':
      return await getLanguagesVoicesForWhisper(credential, getTtsVoices, logger);
+    case 'openai':
+      return await getLanguagesVoicesForOpenAi(credential, getTtsVoices, logger);
    case 'verbio':
      return await getLanguagesVoicesForVerbio(credential, getTtsVoices, logger);
    case 'speechmatics':
@@ -1014,6 +1061,10 @@ async function getLanguagesVoicesForWhisper(credential) {
  return tranform(TtsWhisperLanguagesVoices, undefined, TtsModelWhisper);
 }

+/**
+ * Build the language/model listing for the OpenAI vendor from the static data tables.
+ * The `credential` argument is accepted but not read; no TTS voice list is supplied.
+ *
+ * @returns {Promise<object>} result of tranform() given the OpenAI STT languages,
+ *   TTS models and STT models
+ */
+async function getLanguagesVoicesForOpenAi(credential) {
+  const noTtsVoices = undefined;
+  const listing = tranform(noTtsVoices, SttOpenaiLanguagesVoices, TtsModelOpenai, SttModelOpenai);
+  return listing;
+}
+
 async function getLanguagesVoicesForVerbio(credentials, getTtsVoices, logger) {
  const stt = SttVerbioLanguagesVoices.reduce((acc, v) => {
    if (!v.version || (credentials && credentials.engine_version === v.version)) {
@@ -1034,11 +1085,12 @@ async function getLanguagesVoicesForVerbio(credentials, getTtsVoices, logger) {
  }
 }

-function tranform(tts, stt, models) {
+// Bundle the supplied tts / stt / models / sttModels values into a single object,
+// leaving out the key for any argument that is falsy.
+function tranform(tts, stt, models, sttModels) {
  return {
    ...(tts && {tts}),
    ...(stt && {stt}),
-    ...(models && {models})
+    ...(models && {models}),
+    ...(sttModels && {sttModels})
  };
 }

@@ -1224,7 +1276,7 @@ const testCartesia = async(logger, synthAudio, credentials) => {
 async function getLanguagesVoicesForCartesia(credential) {
  if (credential) {
    const {model_id} = credential;
-    const {languages} = ttsModelCartesia.find((m) => m.value === model_id);
+    const {languages} = TtsModelCartesia.find((m) => m.value === model_id);
    const voices = await fetchCartesiaVoices(credential);

    const buildVoice = (d) => (
@@ -1301,5 +1353,6 @@ module.exports = {
  getLanguagesAndVoicesForVendor,
  testSpeechmaticsStt,
  testCartesia,
-  testVoxistStt
+  testVoxistStt,
+  testOpenAiStt
 };