mirror of
https://github.com/jambonz/jambonz-api-server.git
synced 2025-12-19 05:47:46 +00:00
support openai stt (#402)
* support openai stt * wip * wip * add stt languages for openai
This commit is contained in:
@@ -13,7 +13,8 @@ const {decryptCredential, testWhisper, testDeepgramTTS,
|
||||
testVerbioStt,
|
||||
testSpeechmaticsStt,
|
||||
testCartesia,
|
||||
testVoxistStt} = require('../../utils/speech-utils');
|
||||
testVoxistStt,
|
||||
testOpenAiStt} = require('../../utils/speech-utils');
|
||||
const {DbErrorUnprocessableRequest, DbErrorForbidden, DbErrorBadRequest} = require('../../utils/errors');
|
||||
const {
|
||||
testGoogleTts,
|
||||
@@ -282,6 +283,12 @@ const encryptCredential = (obj) => {
|
||||
const whisperData = JSON.stringify({api_key, model_id});
|
||||
return encrypt(whisperData);
|
||||
|
||||
case 'openai':
|
||||
assert(api_key, 'invalid openai speech credential: api_key is required');
|
||||
assert(model_id, 'invalid openai speech credential: model_id is required');
|
||||
const openaiData = JSON.stringify({api_key, model_id});
|
||||
return encrypt(openaiData);
|
||||
|
||||
case 'verbio':
|
||||
assert(engine_version, 'invalid verbio speech credential: client_id is required');
|
||||
assert(client_id, 'invalid verbio speech credential: client_id is required');
|
||||
@@ -882,6 +889,17 @@ router.get('/:sid/test', async(req, res) => {
|
||||
SpeechCredential.ttsTestResult(sid, false);
|
||||
}
|
||||
}
|
||||
} else if (cred.vendor === 'openai') {
|
||||
if (cred.use_for_stt) {
|
||||
try {
|
||||
await testOpenAiStt(logger, credential);
|
||||
results.stt.status = 'ok';
|
||||
SpeechCredential.sttTestResult(sid, true);
|
||||
} catch (err) {
|
||||
results.stt = {status: 'fail', reason: err.message};
|
||||
SpeechCredential.sttTestResult(sid, false);
|
||||
}
|
||||
}
|
||||
} else if (cred.vendor === 'verbio') {
|
||||
if (cred.use_for_tts) {
|
||||
try {
|
||||
|
||||
6
lib/utils/speech-data/stt-model-openai.js
Normal file
6
lib/utils/speech-data/stt-model-openai.js
Normal file
@@ -0,0 +1,6 @@
|
||||
module.exports = [
|
||||
{ name: 'Whisper', value: 'whisper-1' },
|
||||
{ name: 'GPT 4o Mini Transcribe', value: 'gpt-4o-mini-transcribe' },
|
||||
{ name: 'GLT 4o Transcribe', value: 'gpt-4o-transcribe' },
|
||||
];
|
||||
|
||||
59
lib/utils/speech-data/stt-openai.js
Normal file
59
lib/utils/speech-data/stt-openai.js
Normal file
@@ -0,0 +1,59 @@
|
||||
module.exports = [
|
||||
{ name: 'Afrikaans', value: 'af' },
|
||||
{ name: 'Arabic', value: 'ar' },
|
||||
{ name: 'Azerbaijani', value: 'az' },
|
||||
{ name: 'Belarusian', value: 'be' },
|
||||
{ name: 'Bulgarian', value: 'bg' },
|
||||
{ name: 'Bosnian', value: 'bs' },
|
||||
{ name: 'Catalan', value: 'ca' },
|
||||
{ name: 'Czech', value: 'cs' },
|
||||
{ name: 'Welsh', value: 'cy' },
|
||||
{ name: 'Danish', value: 'da' },
|
||||
{ name: 'German', value: 'de' },
|
||||
{ name: 'Greek', value: 'el' },
|
||||
{ name: 'English', value: 'en' },
|
||||
{ name: 'Spanish', value: 'es' },
|
||||
{ name: 'Estonian', value: 'et' },
|
||||
{ name: 'Persian', value: 'fa' },
|
||||
{ name: 'Finnish', value: 'fi' },
|
||||
{ name: 'French', value: 'fr' },
|
||||
{ name: 'Galician', value: 'gl' },
|
||||
{ name: 'Hebrew', value: 'he' },
|
||||
{ name: 'Hindi', value: 'hi' },
|
||||
{ name: 'Croatian', value: 'hr' },
|
||||
{ name: 'Hungarian', value: 'hu' },
|
||||
{ name: 'Armenian', value: 'hy' },
|
||||
{ name: 'Indonesian', value: 'id' },
|
||||
{ name: 'Icelandic', value: 'is' },
|
||||
{ name: 'Italian', value: 'it' },
|
||||
{ name: 'Japanese', value: 'ja' },
|
||||
{ name: 'Kazakh', value: 'kk' },
|
||||
{ name: 'Kannada', value: 'kn' },
|
||||
{ name: 'Korean', value: 'ko' },
|
||||
{ name: 'Lithuanian', value: 'lt' },
|
||||
{ name: 'Latvian', value: 'lv' },
|
||||
{ name: 'Maori', value: 'mi' },
|
||||
{ name: 'Macedonian', value: 'mk' },
|
||||
{ name: 'Marathi', value: 'mr' },
|
||||
{ name: 'Malay', value: 'ms' },
|
||||
{ name: 'Nepali', value: 'ne' },
|
||||
{ name: 'Dutch', value: 'nl' },
|
||||
{ name: 'Norwegian', value: 'no' },
|
||||
{ name: 'Polish', value: 'pl' },
|
||||
{ name: 'Portuguese', value: 'pt' },
|
||||
{ name: 'Romanian', value: 'ro' },
|
||||
{ name: 'Russian', value: 'ru' },
|
||||
{ name: 'Slovak', value: 'sk' },
|
||||
{ name: 'Slovenian', value: 'sl' },
|
||||
{ name: 'Serbian', value: 'sr' },
|
||||
{ name: 'Swedish', value: 'sv' },
|
||||
{ name: 'Swahili', value: 'sw' },
|
||||
{ name: 'Tamil', value: 'ta' },
|
||||
{ name: 'Thai', value: 'th' },
|
||||
{ name: 'Tagalog', value: 'tl' },
|
||||
{ name: 'Turkish', value: 'tr' },
|
||||
{ name: 'Ukrainian', value: 'uk' },
|
||||
{ name: 'Urdu', value: 'ur' },
|
||||
{ name: 'Vietnamese', value: 'vi' },
|
||||
{ name: 'Chinese', value: 'zh' },
|
||||
];
|
||||
6
lib/utils/speech-data/tts-model-openai.js
Normal file
6
lib/utils/speech-data/tts-model-openai.js
Normal file
@@ -0,0 +1,6 @@
|
||||
// Text-to-speech models selectable for the OpenAI vendor.
// `name` is the display label, `value` is the model identifier string.
const openAiTtsModels = [
  ['TTS-1', 'tts-1'],
  ['TTS-1-HD', 'tts-1-hd'],
  ['GPT-4o-Mini-TTS', 'gpt-4o-mini-tts'],
].map(([name, value]) => ({ name, value }));

module.exports = openAiTtsModels;
|
||||
|
||||
@@ -20,6 +20,7 @@ const TtsElevenlabsLanguagesVoices = require('./speech-data/tts-elevenlabs');
|
||||
const TtsWhisperLanguagesVoices = require('./speech-data/tts-whisper');
|
||||
const TtsPlayHtLanguagesVoices = require('./speech-data/tts-playht');
|
||||
const TtsVerbioLanguagesVoices = require('./speech-data/tts-verbio');
|
||||
const ttsCartesia = require('./speech-data/tts-cartesia');
|
||||
|
||||
const TtsModelDeepgram = require('./speech-data/tts-model-deepgram');
|
||||
const TtsLanguagesDeepgram = require('./speech-data/tts-deepgram');
|
||||
@@ -29,6 +30,7 @@ const TtsModelPlayHT = require('./speech-data/tts-model-playht');
|
||||
const ttsLanguagesPlayHt = require('./speech-data/tts-languages-playht');
|
||||
const TtsModelRimelabs = require('./speech-data/tts-model-rimelabs');
|
||||
const TtsModelCartesia = require('./speech-data/tts-model-cartesia');
|
||||
const TtsModelOpenai = require('./speech-data/tts-model-openai');
|
||||
|
||||
const SttGoogleLanguagesVoices = require('./speech-data/stt-google');
|
||||
const SttAwsLanguagesVoices = require('./speech-data/stt-aws');
|
||||
@@ -43,8 +45,10 @@ const SttSpeechmaticsLanguagesVoices = require('./speech-data/stt-speechmatics')
|
||||
const SttAssemblyaiLanguagesVoices = require('./speech-data/stt-assemblyai');
|
||||
const SttVoxistLanguagesVoices = require('./speech-data/stt-voxist');
|
||||
const SttVerbioLanguagesVoices = require('./speech-data/stt-verbio');
|
||||
const ttsCartesia = require('./speech-data/tts-cartesia');
|
||||
const ttsModelCartesia = require('./speech-data/tts-model-cartesia');
|
||||
const SttOpenaiLanguagesVoices = require('./speech-data/stt-openai');
|
||||
|
||||
|
||||
const SttModelOpenai = require('./speech-data/stt-model-openai');
|
||||
|
||||
|
||||
const testSonioxStt = async(logger, credentials) => {
|
||||
@@ -477,6 +481,43 @@ const testVerbioStt = async(logger, getVerbioAccessToken, credentials) => {
|
||||
}
|
||||
};
|
||||
|
||||
/**
 * Verify an OpenAI speech-to-text credential by posting the test audio file
 * (`data/test_audio.wav`, resolved relative to this module) to the OpenAI
 * transcriptions endpoint.
 *
 * The probe always requests the `whisper-1` model; only `api_key` is read
 * from the credential.
 *
 * @param {object} logger - logger exposing `debug` and `info`
 * @param {object} credentials - decrypted credential containing `api_key`
 * @returns {Promise<object>} parsed JSON body of the successful response
 * @throws {Error} on a non-2xx response; the message carries the HTTP status
 *   and, when the response body supplies one, the OpenAI error message.
 *   File-read and network errors are logged and rethrown unchanged.
 */
const testOpenAiStt = async(logger, credentials) => {
  const {api_key} = credentials;
  try {
    // Build the multipart body: the audio goes in the 'file' field
    const formData = new FormData();
    const audioBuffer = fs.readFileSync(`${__dirname}/../../data/test_audio.wav`);
    const blob = new Blob([audioBuffer], { type: 'audio/wav' });
    formData.append('file', blob, 'audio.wav');

    // Add the model parameter (required by OpenAI)
    formData.append('model', 'whisper-1');

    const response = await fetch('https://api.openai.com/v1/audio/transcriptions', {
      method: 'POST',
      headers: {
        'Authorization': `Bearer ${api_key}`,
        'User-Agent': 'jambonz'
      },
      body: formData
    });

    if (!response.ok) {
      // The error body is not guaranteed to be JSON (e.g. an HTML page from a
      // gateway); a parse failure must not mask the HTTP status.
      const body = await response.json().catch(() => null);
      const detail = body?.error?.message;
      throw new Error(`OpenAI API error: ${response.status}${detail ? ` ${detail}` : ''}`);
    }

    const json = await response.json();
    logger.debug({json}, 'successfully speech to text from OpenAI');
    return json;
  } catch (err) {
    logger.info({err}, 'OpenAI speech-to-text request failed');
    throw err;
  }
};
|
||||
|
||||
const testAssemblyStt = async(logger, credentials) => {
|
||||
const {api_key} = credentials;
|
||||
|
||||
@@ -651,6 +692,10 @@ function decryptCredential(obj, credential, logger, isObscureKey = true) {
|
||||
const o = JSON.parse(decrypt(credential));
|
||||
obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
|
||||
obj.model_id = o.model_id;
|
||||
} else if ('openai' === obj.vendor) {
|
||||
const o = JSON.parse(decrypt(credential));
|
||||
obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
|
||||
obj.model_id = o.model_id;
|
||||
} else if ('verbio' === obj.vendor) {
|
||||
const o = JSON.parse(decrypt(credential));
|
||||
obj.client_id = o.client_id;
|
||||
@@ -714,6 +759,8 @@ async function getLanguagesAndVoicesForVendor(logger, vendor, credential, getTts
|
||||
return await getLanguagesVoicesForVoxist(credential, getTtsVoices, logger);
|
||||
case 'whisper':
|
||||
return await getLanguagesVoicesForWhisper(credential, getTtsVoices, logger);
|
||||
case 'openai':
|
||||
return await getLanguagesVoicesForOpenAi(credential, getTtsVoices, logger);
|
||||
case 'verbio':
|
||||
return await getLanguagesVoicesForVerbio(credential, getTtsVoices, logger);
|
||||
case 'speechmatics':
|
||||
@@ -1014,6 +1061,10 @@ async function getLanguagesVoicesForWhisper(credential) {
|
||||
return tranform(TtsWhisperLanguagesVoices, undefined, TtsModelWhisper);
|
||||
}
|
||||
|
||||
/**
 * Build the language/model listing for the OpenAI vendor from static data.
 *
 * @param {object} credential - accepted for signature parity; not read here
 * @returns {Promise<object>} result of `tranform` given the OpenAI STT
 *   language list, the OpenAI TTS model list and the OpenAI STT model list
 */
async function getLanguagesVoicesForOpenAi(credential) {
  const stt = SttOpenaiLanguagesVoices;
  const models = TtsModelOpenai;
  const sttModels = SttModelOpenai;
  // no TTS voice list is passed for this vendor
  return tranform(undefined, stt, models, sttModels);
}
|
||||
|
||||
async function getLanguagesVoicesForVerbio(credentials, getTtsVoices, logger) {
|
||||
const stt = SttVerbioLanguagesVoices.reduce((acc, v) => {
|
||||
if (!v.version || (credentials && credentials.engine_version === v.version)) {
|
||||
@@ -1034,11 +1085,12 @@ async function getLanguagesVoicesForVerbio(credentials, getTtsVoices, logger) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Assemble the languages/voices response object, including only the
 * sections that were actually supplied.
 *
 * Each argument is copied onto the result under its own name when truthy and
 * omitted entirely when falsy, so callers can pass `undefined` for sections a
 * vendor does not provide.
 *
 * @param {*} [tts] - value for the `tts` key
 * @param {*} [stt] - value for the `stt` key
 * @param {*} [models] - value for the `models` key
 * @param {*} [sttModels] - value for the `sttModels` key
 * @returns {object} object holding only the keys whose argument was truthy
 */
function tranform(tts, stt, models, sttModels) {
  return {
    ...(tts && {tts}),
    ...(stt && {stt}),
    ...(models && {models}),
    ...(sttModels && {sttModels})
  };
}
|
||||
|
||||
@@ -1224,7 +1276,7 @@ const testCartesia = async(logger, synthAudio, credentials) => {
|
||||
async function getLanguagesVoicesForCartesia(credential) {
|
||||
if (credential) {
|
||||
const {model_id} = credential;
|
||||
const {languages} = ttsModelCartesia.find((m) => m.value === model_id);
|
||||
const {languages} = TtsModelCartesia.find((m) => m.value === model_id);
|
||||
const voices = await fetchCartesiaVoices(credential);
|
||||
|
||||
const buildVoice = (d) => (
|
||||
@@ -1301,5 +1353,6 @@ module.exports = {
|
||||
getLanguagesAndVoicesForVendor,
|
||||
testSpeechmaticsStt,
|
||||
testCartesia,
|
||||
testVoxistStt
|
||||
testVoxistStt,
|
||||
testOpenAiStt
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user