Compare commits

...

3 Commits

Author SHA1 Message Date
Hoan Luu Huu
1c55bad04f support openai stt (#402)
* support openai stt

* wip

* wip

* add stt languages for openai
2025-03-28 10:14:50 -04:00
Hoan Luu Huu
32a2bfcdb5 support cartesia sonic-2 model (#403)
* support cartesia sonic-2 model

* wip

* fix typo

---------

Co-authored-by: Dave Horton <daveh@beachdognet.com>
2025-03-28 09:52:01 -04:00
Hoan Luu Huu
becc1636b7 deepgram multi languages (#397) 2025-03-17 21:10:22 -04:00
7 changed files with 213 additions and 143 deletions

View File

@@ -13,7 +13,8 @@ const {decryptCredential, testWhisper, testDeepgramTTS,
testVerbioStt,
testSpeechmaticsStt,
testCartesia,
testVoxistStt} = require('../../utils/speech-utils');
testVoxistStt,
testOpenAiStt} = require('../../utils/speech-utils');
const {DbErrorUnprocessableRequest, DbErrorForbidden, DbErrorBadRequest} = require('../../utils/errors');
const {
testGoogleTts,
@@ -282,6 +283,12 @@ const encryptCredential = (obj) => {
const whisperData = JSON.stringify({api_key, model_id});
return encrypt(whisperData);
case 'openai':
assert(api_key, 'invalid openai speech credential: api_key is required');
assert(model_id, 'invalid openai speech credential: model_id is required');
const openaiData = JSON.stringify({api_key, model_id});
return encrypt(openaiData);
case 'verbio':
assert(engine_version, 'invalid verbio speech credential: client_id is required');
assert(client_id, 'invalid verbio speech credential: client_id is required');
@@ -882,6 +889,17 @@ router.get('/:sid/test', async(req, res) => {
SpeechCredential.ttsTestResult(sid, false);
}
}
} else if (cred.vendor === 'openai') {
if (cred.use_for_stt) {
try {
await testOpenAiStt(logger, credential);
results.stt.status = 'ok';
SpeechCredential.sttTestResult(sid, true);
} catch (err) {
results.stt = {status: 'fail', reason: err.message};
SpeechCredential.sttTestResult(sid, false);
}
}
} else if (cred.vendor === 'verbio') {
if (cred.use_for_tts) {
try {

View File

@@ -1,138 +1,56 @@
module.exports = [
{
name: 'Chinese - general',
value: 'zh',
},
{
name: 'Chinese (China)',
value: 'zh-CN',
},
{
name: 'Chinese (Taiwan)',
value: 'zh-TW',
},
{
name: 'Dutch - general',
value: 'nl',
},
{
name: 'English - general',
value: 'en',
},
{
name: 'English (Australia)',
value: 'en-AU',
},
{
name: 'English (United Kingdom)',
value: 'en-GB',
},
{
name: 'English (India)',
value: 'en-IN',
},
{
name: 'English (New Zealand)',
value: 'en-NZ',
},
{
name: 'English (United States)',
value: 'en-US',
},
{
name: 'French - general',
value: 'fr',
},
{
name: 'French (Canada)',
value: 'fr-CA',
},
{
name: 'German - general',
value: 'de',
},
{
name: 'Hindi - general',
value: 'hi',
},
{
name: 'Hindi (Roman Script)',
value: 'hi-Latin',
},
{
name: 'Indonesian - general',
value: 'in',
},
{
name: 'Italian - general',
value: 'it',
},
{
name: 'Japanese - general',
value: 'ja',
},
{
name: 'Korean - general',
value: 'ko',
},
{
name: 'Norwegian - general',
value: 'no',
},
{
name: 'Polish - general',
value: 'pl',
},
{
name: 'Portuguese - general',
value: 'pt',
},
{
name: 'Portuguese (Brazil)',
value: 'pt-BR',
},
{
name: 'Portuguese (Portugal)',
value: 'pt-PT',
},
{
name: 'Russian - general',
value: 'ru',
},
{
name: 'Spanish - general',
value: 'es',
},
{
name: 'Spanish (Latin America)',
value: 'es-419',
},
{
name: 'Swedish - general',
value: 'sv',
},
{
name: 'Turkish - general',
value: 'tr',
},
{
name: 'Ukrainian - general',
value: 'uk',
},
{
name: 'Flemish - general',
value: 'nl-BE',
},
{
name: 'Danish - general',
value: 'da',
},
{
name: 'Tamil - general',
value: 'ta',
},
{
name: 'Tamasheq - general',
value: 'taq',
},
{ name: 'Multilingual', value: 'multi' },
{ name: 'Bulgarian', value: 'bg' },
{ name: 'Catalan', value: 'ca' },
{ name: 'Chinese (Mandarin, Simplified)', value: 'zh' },
{ name: 'Chinese (Mandarin, Simplified - China)', value: 'zh-CN' },
{ name: 'Chinese (Mandarin, Simplified - Hans)', value: 'zh-Hans' },
{ name: 'Chinese (Mandarin, Traditional)', value: 'zh-TW' },
{ name: 'Chinese (Mandarin, Traditional - Hant)', value: 'zh-Hant' },
{ name: 'Chinese (Cantonese, Traditional - Hong Kong)', value: 'zh-HK' },
{ name: 'Czech', value: 'cs' },
{ name: 'Danish', value: 'da' },
{ name: 'Danish (Denmark)', value: 'da-DK' },
{ name: 'Dutch', value: 'nl' },
{ name: 'English', value: 'en' },
{ name: 'English (United States)', value: 'en-US' },
{ name: 'English (Australia)', value: 'en-AU' },
{ name: 'English (United Kingdom)', value: 'en-GB' },
{ name: 'English (New Zealand)', value: 'en-NZ' },
{ name: 'English (India)', value: 'en-IN' },
{ name: 'Estonian', value: 'et' },
{ name: 'Finnish', value: 'fi' },
{ name: 'Flemish', value: 'nl-BE' },
{ name: 'French', value: 'fr' },
{ name: 'French (Canada)', value: 'fr-CA' },
{ name: 'German', value: 'de' },
{ name: 'German (Switzerland)', value: 'de-CH' },
{ name: 'Greek', value: 'el' },
{ name: 'Hindi', value: 'hi' },
{ name: 'Hungarian', value: 'hu' },
{ name: 'Indonesian', value: 'id' },
{ name: 'Italian', value: 'it' },
{ name: 'Japanese', value: 'ja' },
{ name: 'Korean', value: 'ko' },
{ name: 'Korean (South Korea)', value: 'ko-KR' },
{ name: 'Latvian', value: 'lv' },
{ name: 'Lithuanian', value: 'lt' },
{ name: 'Malay', value: 'ms' },
{ name: 'Norwegian', value: 'no' },
{ name: 'Polish', value: 'pl' },
{ name: 'Portuguese', value: 'pt' },
{ name: 'Portuguese (Brazil)', value: 'pt-BR' },
{ name: 'Portuguese (Portugal)', value: 'pt-PT' },
{ name: 'Romanian', value: 'ro' },
{ name: 'Russian', value: 'ru' },
{ name: 'Slovak', value: 'sk' },
{ name: 'Spanish', value: 'es' },
{ name: 'Spanish (Latin America)', value: 'es-419' },
{ name: 'Swedish', value: 'sv' },
{ name: 'Swedish (Sweden)', value: 'sv-SE' },
{ name: 'Thai', value: 'th' },
{ name: 'Thai (Thailand)', value: 'th-TH' },
{ name: 'Turkish', value: 'tr' },
{ name: 'Ukrainian', value: 'uk' },
{ name: 'Vietnamese', value: 'vi' }
];

View File

@@ -0,0 +1,6 @@
module.exports = [
{ name: 'Whisper', value: 'whisper-1' },
{ name: 'GPT 4o Mini Transcribe', value: 'gpt-4o-mini-transcribe' },
{ name: 'GLT 4o Transcribe', value: 'gpt-4o-transcribe' },
];

View File

@@ -0,0 +1,59 @@
module.exports = [
{ name: 'Afrikaans', value: 'af' },
{ name: 'Arabic', value: 'ar' },
{ name: 'Azerbaijani', value: 'az' },
{ name: 'Belarusian', value: 'be' },
{ name: 'Bulgarian', value: 'bg' },
{ name: 'Bosnian', value: 'bs' },
{ name: 'Catalan', value: 'ca' },
{ name: 'Czech', value: 'cs' },
{ name: 'Welsh', value: 'cy' },
{ name: 'Danish', value: 'da' },
{ name: 'German', value: 'de' },
{ name: 'Greek', value: 'el' },
{ name: 'English', value: 'en' },
{ name: 'Spanish', value: 'es' },
{ name: 'Estonian', value: 'et' },
{ name: 'Persian', value: 'fa' },
{ name: 'Finnish', value: 'fi' },
{ name: 'French', value: 'fr' },
{ name: 'Galician', value: 'gl' },
{ name: 'Hebrew', value: 'he' },
{ name: 'Hindi', value: 'hi' },
{ name: 'Croatian', value: 'hr' },
{ name: 'Hungarian', value: 'hu' },
{ name: 'Armenian', value: 'hy' },
{ name: 'Indonesian', value: 'id' },
{ name: 'Icelandic', value: 'is' },
{ name: 'Italian', value: 'it' },
{ name: 'Japanese', value: 'ja' },
{ name: 'Kazakh', value: 'kk' },
{ name: 'Kannada', value: 'kn' },
{ name: 'Korean', value: 'ko' },
{ name: 'Lithuanian', value: 'lt' },
{ name: 'Latvian', value: 'lv' },
{ name: 'Maori', value: 'mi' },
{ name: 'Macedonian', value: 'mk' },
{ name: 'Marathi', value: 'mr' },
{ name: 'Malay', value: 'ms' },
{ name: 'Nepali', value: 'ne' },
{ name: 'Dutch', value: 'nl' },
{ name: 'Norwegian', value: 'no' },
{ name: 'Polish', value: 'pl' },
{ name: 'Portuguese', value: 'pt' },
{ name: 'Romanian', value: 'ro' },
{ name: 'Russian', value: 'ru' },
{ name: 'Slovak', value: 'sk' },
{ name: 'Slovenian', value: 'sl' },
{ name: 'Serbian', value: 'sr' },
{ name: 'Swedish', value: 'sv' },
{ name: 'Swahili', value: 'sw' },
{ name: 'Tamil', value: 'ta' },
{ name: 'Thai', value: 'th' },
{ name: 'Tagalog', value: 'tl' },
{ name: 'Turkish', value: 'tr' },
{ name: 'Ukrainian', value: 'uk' },
{ name: 'Urdu', value: 'ur' },
{ name: 'Vietnamese', value: 'vi' },
{ name: 'Chinese', value: 'zh' },
];

View File

@@ -4,6 +4,16 @@ module.exports = [
value: 'sonic',
languages: ['en', 'fr', 'de', 'es', 'pt', 'zh', 'ja', 'hi', 'it', 'ko', 'nl', 'pl', 'ru', 'sv', 'tr']
},
{
name: 'Sonic 2',
value: 'sonic-2',
languages: ['en', 'fr', 'de', 'es', 'pt', 'zh', 'ja', 'hi', 'it', 'ko', 'nl', 'pl', 'ru', 'sv', 'tr']
},
{
name: 'Sonic Turbo',
value: 'sonic-turbo',
languages: ['en', 'fr', 'de', 'es', 'pt', 'zh', 'ja', 'hi', 'it', 'ko', 'nl', 'pl', 'ru', 'sv', 'tr']
},
{ name: 'Sonic Preview', value: 'sonic-preview', languages: ['en'] },
{
name: 'Sonic 2024-12-12',

View File

@@ -0,0 +1,6 @@
module.exports = [
{ name: 'TTS-1', value: 'tts-1' },
{ name: 'TTS-1-HD', value: 'tts-1-hd' },
{ name: 'GPT-4o-Mini-TTS', value: 'gpt-4o-mini-tts' },
];

View File

@@ -20,6 +20,7 @@ const TtsElevenlabsLanguagesVoices = require('./speech-data/tts-elevenlabs');
const TtsWhisperLanguagesVoices = require('./speech-data/tts-whisper');
const TtsPlayHtLanguagesVoices = require('./speech-data/tts-playht');
const TtsVerbioLanguagesVoices = require('./speech-data/tts-verbio');
const ttsCartesia = require('./speech-data/tts-cartesia');
const TtsModelDeepgram = require('./speech-data/tts-model-deepgram');
const TtsLanguagesDeepgram = require('./speech-data/tts-deepgram');
@@ -29,6 +30,7 @@ const TtsModelPlayHT = require('./speech-data/tts-model-playht');
const ttsLanguagesPlayHt = require('./speech-data/tts-languages-playht');
const TtsModelRimelabs = require('./speech-data/tts-model-rimelabs');
const TtsModelCartesia = require('./speech-data/tts-model-cartesia');
const TtsModelOpenai = require('./speech-data/tts-model-openai');
const SttGoogleLanguagesVoices = require('./speech-data/stt-google');
const SttAwsLanguagesVoices = require('./speech-data/stt-aws');
@@ -43,8 +45,10 @@ const SttSpeechmaticsLanguagesVoices = require('./speech-data/stt-speechmatics')
const SttAssemblyaiLanguagesVoices = require('./speech-data/stt-assemblyai');
const SttVoxistLanguagesVoices = require('./speech-data/stt-voxist');
const SttVerbioLanguagesVoices = require('./speech-data/stt-verbio');
const ttsCartesia = require('./speech-data/tts-cartesia');
const ttsModelCartesia = require('./speech-data/tts-model-cartesia');
const SttOpenaiLanguagesVoices = require('./speech-data/stt-openai');
const SttModelOpenai = require('./speech-data/stt-model-openai');
const testSonioxStt = async(logger, credentials) => {
@@ -477,6 +481,43 @@ const testVerbioStt = async(logger, getVerbioAccessToken, credentials) => {
}
};
/**
 * Check an OpenAI credential by posting the bundled test recording to the
 * audio transcriptions endpoint.
 *
 * @param {object} logger - logger exposing `debug` and `info`
 * @param {object} credentials - must carry `api_key`, sent as the Bearer token
 * @returns {Promise<object>} the parsed JSON body of a successful response
 * @throws {Error} `OpenAI API error: <status> <reason>` for a non-OK response;
 *   file-read and network errors are logged and rethrown unchanged
 */
const testOpenAiStt = async(logger, credentials) => {
  const {api_key} = credentials;
  try {
    // The endpoint expects multipart/form-data: the audio goes in the 'file'
    // field and the model name in 'model'.
    const audioBuffer = fs.readFileSync(`${__dirname}/../../data/test_audio.wav`);
    const formData = new FormData();
    formData.append('file', new Blob([audioBuffer], {type: 'audio/wav'}), 'audio.wav');
    formData.append('model', 'whisper-1');

    const response = await fetch('https://api.openai.com/v1/audio/transcriptions', {
      method: 'POST',
      headers: {
        'Authorization': `Bearer ${api_key}`,
        'User-Agent': 'jambonz'
      },
      body: formData
    });

    if (!response.ok) {
      // An error body is not guaranteed to be JSON (e.g. an HTML page from an
      // intermediary). A parse failure must not replace the HTTP status in the
      // error we report, so it is only logged.
      let detail;
      try {
        detail = (await response.json())?.error?.message;
      } catch (parseErr) {
        logger.debug({parseErr}, 'OpenAI error response body was not valid JSON');
      }
      const reason = detail || response.statusText || 'unknown error';
      throw new Error(`OpenAI API error: ${response.status} ${reason}`);
    }

    const json = await response.json();
    logger.debug({json}, 'successfully speech to text from OpenAI');
    return json;
  } catch (err) {
    logger.info({err}, 'OpenAI speech-to-text request failed');
    throw err;
  }
};
const testAssemblyStt = async(logger, credentials) => {
const {api_key} = credentials;
@@ -651,6 +692,10 @@ function decryptCredential(obj, credential, logger, isObscureKey = true) {
const o = JSON.parse(decrypt(credential));
obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
obj.model_id = o.model_id;
} else if ('openai' === obj.vendor) {
const o = JSON.parse(decrypt(credential));
obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
obj.model_id = o.model_id;
} else if ('verbio' === obj.vendor) {
const o = JSON.parse(decrypt(credential));
obj.client_id = o.client_id;
@@ -714,6 +759,8 @@ async function getLanguagesAndVoicesForVendor(logger, vendor, credential, getTts
return await getLanguagesVoicesForVoxist(credential, getTtsVoices, logger);
case 'whisper':
return await getLanguagesVoicesForWhisper(credential, getTtsVoices, logger);
case 'openai':
return await getLanguagesVoicesForOpenAi(credential, getTtsVoices, logger);
case 'verbio':
return await getLanguagesVoicesForVerbio(credential, getTtsVoices, logger);
case 'speechmatics':
@@ -1014,6 +1061,10 @@ async function getLanguagesVoicesForWhisper(credential) {
return tranform(TtsWhisperLanguagesVoices, undefined, TtsModelWhisper);
}
/**
 * Build the language/model listing for the 'openai' vendor from the static
 * data tables.
 *
 * @param {object} credential - not read by this function
 * @returns {Promise<object>} whatever `tranform` returns for the arguments
 *   (no tts list, STT languages, TTS models, STT models)
 */
async function getLanguagesVoicesForOpenAi(credential) {
  // No TTS voice list is passed for this vendor; the first argument stays undefined.
  const ttsVoices = undefined;
  const listing = tranform(ttsVoices, SttOpenaiLanguagesVoices, TtsModelOpenai, SttModelOpenai);
  return listing;
}
async function getLanguagesVoicesForVerbio(credentials, getTtsVoices, logger) {
const stt = SttVerbioLanguagesVoices.reduce((acc, v) => {
if (!v.version || (credentials && credentials.engine_version === v.version)) {
@@ -1034,11 +1085,12 @@ async function getLanguagesVoicesForVerbio(credentials, getTtsVoices, logger) {
}
}
/**
 * Assemble a vendor listing object from its optional sections.
 * A section is included only when its argument is truthy, so callers may pass
 * `undefined` for anything the vendor does not provide.
 *
 * @param {*} [tts] - stored under `tts` when truthy
 * @param {*} [stt] - stored under `stt` when truthy
 * @param {*} [models] - stored under `models` when truthy
 * @param {*} [sttModels] - stored under `sttModels` when truthy; callers that
 *   pass only three arguments get the same result as before
 * @returns {object} object holding just the truthy sections
 */
function tranform(tts, stt, models, sttModels) {
  return {
    ...(tts && {tts}),
    ...(stt && {stt}),
    ...(models && {models}),
    ...(sttModels && {sttModels})
  };
}
@@ -1224,7 +1276,7 @@ const testCartesia = async(logger, synthAudio, credentials) => {
async function getLanguagesVoicesForCartesia(credential) {
if (credential) {
const {model_id} = credential;
const {languages} = ttsModelCartesia.find((m) => m.value === model_id);
const {languages} = TtsModelCartesia.find((m) => m.value === model_id);
const voices = await fetchCartesiaVoices(credential);
const buildVoice = (d) => (
@@ -1301,5 +1353,6 @@ module.exports = {
getLanguagesAndVoicesForVendor,
testSpeechmaticsStt,
testCartesia,
testVoxistStt
testVoxistStt,
testOpenAiStt
};