mirror of
https://github.com/jambonz/jambonz-api-server.git
synced 2025-12-19 05:47:46 +00:00
support mod_playht_tts (#304)
* support mod_playht_tts * wip * wip * wip * wip * wip * update speech utils version
This commit is contained in:
@@ -162,7 +162,7 @@ regex VARCHAR(32) NOT NULL COMMENT 'regex-based pattern match against dialed num
|
||||
description VARCHAR(1024),
|
||||
priority INTEGER NOT NULL COMMENT 'lower priority routes are attempted first',
|
||||
PRIMARY KEY (lcr_route_sid)
|
||||
) COMMENT='An ordered list of digit patterns in an LCR table. The pat';
|
||||
) COMMENT='An ordered list of digit patterns in an LCR table. The patterns are tested in sequence until one matches';
|
||||
|
||||
CREATE TABLE lcr
|
||||
(
|
||||
@@ -173,7 +173,7 @@ default_carrier_set_entry_sid CHAR(36) COMMENT 'default carrier/route to use whe
|
||||
service_provider_sid CHAR(36),
|
||||
account_sid CHAR(36),
|
||||
PRIMARY KEY (lcr_sid)
|
||||
) COMMENT='An LCR (least cost routing) table that is used by a service ';
|
||||
) COMMENT='An LCR (least cost routing) table that is used by a service provider or account to make decisions about routing outbound calls when multiple carriers are available.';
|
||||
|
||||
CREATE TABLE password_settings
|
||||
(
|
||||
@@ -496,7 +496,7 @@ messaging_hook_sid CHAR(36) COMMENT 'webhook to call for inbound SMS/MMS ',
|
||||
app_json TEXT,
|
||||
speech_synthesis_vendor VARCHAR(64) NOT NULL DEFAULT 'google',
|
||||
speech_synthesis_language VARCHAR(12) NOT NULL DEFAULT 'en-US',
|
||||
speech_synthesis_voice VARCHAR(64),
|
||||
speech_synthesis_voice VARCHAR(256),
|
||||
speech_synthesis_label VARCHAR(64),
|
||||
speech_recognizer_vendor VARCHAR(64) NOT NULL DEFAULT 'google',
|
||||
speech_recognizer_language VARCHAR(64) NOT NULL DEFAULT 'en-US',
|
||||
@@ -504,7 +504,7 @@ speech_recognizer_label VARCHAR(64),
|
||||
use_for_fallback_speech BOOLEAN DEFAULT false,
|
||||
fallback_speech_synthesis_vendor VARCHAR(64),
|
||||
fallback_speech_synthesis_language VARCHAR(12),
|
||||
fallback_speech_synthesis_voice VARCHAR(64),
|
||||
fallback_speech_synthesis_voice VARCHAR(256),
|
||||
fallback_speech_synthesis_label VARCHAR(64),
|
||||
fallback_speech_recognizer_vendor VARCHAR(64),
|
||||
fallback_speech_recognizer_language VARCHAR(64),
|
||||
|
||||
@@ -2568,7 +2568,7 @@
|
||||
</SQLField>
|
||||
<SQLField>
|
||||
<name><![CDATA[speech_synthesis_voice]]></name>
|
||||
<type><![CDATA[VARCHAR(64)]]></type>
|
||||
<type><![CDATA[VARCHAR(256)]]></type>
|
||||
<notNull><![CDATA[0]]></notNull>
|
||||
<uid><![CDATA[929D66F0-64B9-4D7C-AB4B-24F131E1178F]]></uid>
|
||||
</SQLField>
|
||||
@@ -2618,7 +2618,7 @@
|
||||
</SQLField>
|
||||
<SQLField>
|
||||
<name><![CDATA[fallback_speech_synthesis_voice]]></name>
|
||||
<type><![CDATA[VARCHAR(64)]]></type>
|
||||
<type><![CDATA[VARCHAR(256)]]></type>
|
||||
<notNull><![CDATA[0]]></notNull>
|
||||
<uid><![CDATA[6A0E92C9-32B9-4179-A893-3DADF5DD7728]]></uid>
|
||||
</SQLField>
|
||||
@@ -3108,7 +3108,7 @@
|
||||
<RightSidebarWidth><![CDATA[1235.000000]]></RightSidebarWidth>
|
||||
<sidebarIndex><![CDATA[2]]></sidebarIndex>
|
||||
<snapToGrid><![CDATA[0]]></snapToGrid>
|
||||
<SourceSidebarWidth><![CDATA[0.000000]]></SourceSidebarWidth>
|
||||
<SourceSidebarWidth><![CDATA[312.000000]]></SourceSidebarWidth>
|
||||
<SQLEditorFileFormatVersion><![CDATA[4]]></SQLEditorFileFormatVersion>
|
||||
<uid><![CDATA[58C99A00-06C9-478C-A667-C63842E088F3]]></uid>
|
||||
<windowHeight><![CDATA[870.000000]]></windowHeight>
|
||||
|
||||
@@ -194,6 +194,8 @@ const sql = {
|
||||
],
|
||||
9000: [
|
||||
'ALTER TABLE sip_gateways ADD COLUMN send_options_ping BOOLEAN NOT NULL DEFAULT 0',
|
||||
'ALTER TABLE applications MODIFY COLUMN speech_synthesis_voice VARCHAR(256)',
|
||||
'ALTER TABLE applications MODIFY COLUMN fallback_speech_synthesis_voice VARCHAR(256)',
|
||||
]
|
||||
};
|
||||
|
||||
|
||||
@@ -6,7 +6,8 @@ const sysError = require('../error');
|
||||
const {decrypt, encrypt} = require('../../utils/encrypt-decrypt');
|
||||
const {parseAccountSid, parseServiceProviderSid, parseSpeechCredentialSid} = require('./utils');
|
||||
const {decryptCredential, testWhisper, testDeepgramTTS,
|
||||
getLanguagesAndVoicesForVendor} = require('../../utils/speech-utils');
|
||||
getLanguagesAndVoicesForVendor,
|
||||
testPlayHT} = require('../../utils/speech-utils');
|
||||
const {DbErrorUnprocessableRequest, DbErrorForbidden, DbErrorBadRequest} = require('../../utils/errors');
|
||||
const {
|
||||
testGoogleTts,
|
||||
@@ -135,6 +136,8 @@ const encryptCredential = (obj) => {
|
||||
auth_token = '',
|
||||
cobalt_server_uri,
|
||||
model_id,
|
||||
user_id,
|
||||
voice_engine,
|
||||
options
|
||||
} = obj;
|
||||
|
||||
@@ -219,6 +222,13 @@ const encryptCredential = (obj) => {
|
||||
const elevenlabsData = JSON.stringify({api_key, model_id, options});
|
||||
return encrypt(elevenlabsData);
|
||||
|
||||
case 'playht':
|
||||
assert(api_key, 'invalid playht speech credential: api_key is required');
|
||||
assert(user_id, 'invalid playht speech credential: user_id is required');
|
||||
assert(voice_engine, 'invalid voice_engine speech credential: voice_engine is required');
|
||||
const playhtData = JSON.stringify({api_key, user_id, voice_engine, options});
|
||||
return encrypt(playhtData);
|
||||
|
||||
case 'assemblyai':
|
||||
assert(api_key, 'invalid assemblyai speech credential: api_key is required');
|
||||
const assemblyaiData = JSON.stringify({api_key});
|
||||
@@ -418,6 +428,7 @@ router.put('/:sid', async(req, res) => {
|
||||
custom_tts_url,
|
||||
cobalt_server_uri,
|
||||
model_id,
|
||||
voice_engine,
|
||||
options,
|
||||
deepgram_stt_uri,
|
||||
deepgram_stt_use_tls,
|
||||
@@ -443,6 +454,7 @@ router.put('/:sid', async(req, res) => {
|
||||
custom_tts_url,
|
||||
cobalt_server_uri,
|
||||
model_id,
|
||||
voice_engine,
|
||||
options,
|
||||
deepgram_stt_uri,
|
||||
deepgram_stt_use_tls,
|
||||
@@ -724,6 +736,17 @@ router.get('/:sid/test', async(req, res) => {
|
||||
SpeechCredential.ttsTestResult(sid, false);
|
||||
}
|
||||
}
|
||||
} else if (cred.vendor === 'playht') {
|
||||
if (cred.use_for_tts) {
|
||||
try {
|
||||
await testPlayHT(logger, synthAudio, credential);
|
||||
results.tts.status = 'ok';
|
||||
SpeechCredential.ttsTestResult(sid, true);
|
||||
} catch (err) {
|
||||
results.tts = {status: 'fail', reason: err.message};
|
||||
SpeechCredential.ttsTestResult(sid, false);
|
||||
}
|
||||
}
|
||||
} else if (cred.vendor === 'assemblyai') {
|
||||
const {api_key} = credential;
|
||||
if (cred.use_for_stt) {
|
||||
|
||||
6
lib/utils/speech-data/tts-model-playht.js
Normal file
6
lib/utils/speech-data/tts-model-playht.js
Normal file
@@ -0,0 +1,6 @@
|
||||
module.exports = [
|
||||
{ name: 'PlayHT2.0-turbo', value: 'PlayHT2.0-turbo' },
|
||||
{ name: 'PlayHT2.0', value: 'PlayHT2.0' },
|
||||
{ name: 'PlayHT1.0', value: 'PlayHT1.0' },
|
||||
];
|
||||
|
||||
@@ -21,6 +21,7 @@ const TtsWhisperLanguagesVoices = require('./speech-data/tts-whisper');
|
||||
const TtsModelDeepgram = require('./speech-data/tts-model-deepgram');
|
||||
const TtsModelElevenLabs = require('./speech-data/tts-model-elevenlabs');
|
||||
const TtsModelWhisper = require('./speech-data/tts-model-whisper');
|
||||
const TtsModelPlayHT = require('./speech-data/tts-model-playht');
|
||||
|
||||
const SttGoogleLanguagesVoices = require('./speech-data/stt-google');
|
||||
const SttAwsLanguagesVoices = require('./speech-data/stt-aws');
|
||||
@@ -240,6 +241,27 @@ const testElevenlabs = async(logger, credentials) => {
|
||||
}
|
||||
};
|
||||
|
||||
const testPlayHT = async(logger, synthAudio, credentials) => {
|
||||
try {
|
||||
await synthAudio(
|
||||
{
|
||||
increment: () => {},
|
||||
histogram: () => {}
|
||||
},
|
||||
{
|
||||
vendor: 'playht',
|
||||
credentials,
|
||||
language: 'en-US',
|
||||
voice: 's3://voice-cloning-zero-shot/d9ff78ba-d016-47f6-b0ef-dd630f59414e/female-cs/manifest.json',
|
||||
text: 'Hi there and welcome to jambones!'
|
||||
}
|
||||
);
|
||||
} catch (err) {
|
||||
logger.info({err}, 'synth Playht returned error');
|
||||
throw err;
|
||||
}
|
||||
};
|
||||
|
||||
const testWhisper = async(logger, synthAudio, credentials) => {
|
||||
try {
|
||||
await synthAudio({increment: () => {}, histogram: () => {}},
|
||||
@@ -428,6 +450,12 @@ function decryptCredential(obj, credential, logger, isObscureKey = true) {
|
||||
obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
|
||||
obj.model_id = o.model_id;
|
||||
obj.options = o.options;
|
||||
} else if ('playht' === obj.vendor) {
|
||||
const o = JSON.parse(decrypt(credential));
|
||||
obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
|
||||
obj.user_id = o.user_id;
|
||||
obj.voice_engine = o.voice_engine;
|
||||
obj.options = o.options;
|
||||
} else if (obj.vendor.startsWith('custom:')) {
|
||||
const o = JSON.parse(decrypt(credential));
|
||||
obj.auth_token = isObscureKey ? obscureKey(o.auth_token) : o.auth_token;
|
||||
@@ -488,6 +516,8 @@ async function getLanguagesAndVoicesForVendor(logger, vendor, credential, getTts
|
||||
return await getLanguagesVoicesForSoniox(credential, getTtsVoices, logger);
|
||||
case 'elevenlabs':
|
||||
return await getLanguagesVoicesForElevenlabs(credential, getTtsVoices, logger);
|
||||
case 'playht':
|
||||
return await getLanguagesVoicesForPlayHT(credential, getTtsVoices, logger);
|
||||
case 'assemblyai':
|
||||
return await getLanguagesVoicesForAssemblyAI(credential, getTtsVoices, logger);
|
||||
case 'whisper':
|
||||
@@ -645,6 +675,49 @@ async function getLanguagesVoicesForElevenlabs(credential) {
|
||||
}
|
||||
}
|
||||
|
||||
const concat = (a) => {
|
||||
return a ? ` ${a},` : '';
|
||||
};
|
||||
|
||||
async function getLanguagesVoicesForPlayHT(credential) {
|
||||
if (credential) {
|
||||
const get = bent('https://api.play.ht', 'GET', 'json', {
|
||||
'AUTHORIZATION' : credential.api_key,
|
||||
'X-USER-ID': credential.user_id,
|
||||
'Accept': 'application/json'
|
||||
});
|
||||
|
||||
const voices = await get('/api/v2/voices');
|
||||
|
||||
const buildVoice = (d) => {
|
||||
let name = `${d.name} -${concat(d.accent)}${concat(d.age)}${concat(d.gender)}
|
||||
${concat(d.loudness)}${concat(d.style)}${concat(d.tempo)}${concat(d.texture)}` ;
|
||||
name = name.endsWith(',') ? name.slice(0, -1) : name;
|
||||
return {
|
||||
value: `${d.id}`,
|
||||
name
|
||||
};
|
||||
};
|
||||
|
||||
const ttsVoices = voices.reduce((acc, voice) => {
|
||||
const languageCode = voice.language_code;
|
||||
const existingLanguage = acc.find((lang) => lang.value === languageCode);
|
||||
if (existingLanguage) {
|
||||
existingLanguage.voices.push(buildVoice(voice));
|
||||
} else {
|
||||
acc.push({
|
||||
value: voice.language_code,
|
||||
name: voice.language,
|
||||
voices: [buildVoice(voice)]
|
||||
});
|
||||
}
|
||||
return acc;
|
||||
}, []);
|
||||
return tranform(ttsVoices, undefined, TtsModelPlayHT);
|
||||
}
|
||||
return tranform(undefined, undefined, TtsModelPlayHT);
|
||||
}
|
||||
|
||||
async function getLanguagesVoicesForAssemblyAI(credential) {
|
||||
return tranform(undefined, SttAssemblyaiLanguagesVoices);
|
||||
}
|
||||
@@ -796,6 +869,7 @@ module.exports = {
|
||||
testIbmStt,
|
||||
testSonioxStt,
|
||||
testElevenlabs,
|
||||
testPlayHT,
|
||||
testAssemblyStt,
|
||||
testDeepgramTTS,
|
||||
getSpeechCredential,
|
||||
|
||||
8
package-lock.json
generated
8
package-lock.json
generated
@@ -19,7 +19,7 @@
|
||||
"@jambonz/lamejs": "^1.2.2",
|
||||
"@jambonz/mw-registrar": "^0.2.7",
|
||||
"@jambonz/realtimedb-helpers": "^0.8.8",
|
||||
"@jambonz/speech-utils": "^0.0.49",
|
||||
"@jambonz/speech-utils": "^0.0.50",
|
||||
"@jambonz/time-series": "^0.2.8",
|
||||
"@jambonz/verb-specifications": "^0.0.69",
|
||||
"@soniox/soniox-node": "^1.2.2",
|
||||
@@ -2027,9 +2027,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@jambonz/speech-utils": {
|
||||
"version": "0.0.49",
|
||||
"resolved": "https://registry.npmjs.org/@jambonz/speech-utils/-/speech-utils-0.0.49.tgz",
|
||||
"integrity": "sha512-hIVdgiPJJN2WYm7qlcP8yZv+1w+z38sCmiFNf9xMAAV6pjrDCEeUjrwpLhaFkWqVmkNrHh9PHuzPlkLDNkzhIA==",
|
||||
"version": "0.0.50",
|
||||
"resolved": "https://registry.npmjs.org/@jambonz/speech-utils/-/speech-utils-0.0.50.tgz",
|
||||
"integrity": "sha512-fcMaOuWrBVFh6FKiiurYhnQV71xXmnkyBQmp4OjNd1Zo8Ya+tZMdAJjyHtimjJdgiwJbwDnfdSwKSuz8G9CVkQ==",
|
||||
"dependencies": {
|
||||
"@aws-sdk/client-polly": "^3.496.0",
|
||||
"@aws-sdk/client-sts": "^3.496.0",
|
||||
|
||||
@@ -29,7 +29,7 @@
|
||||
"@jambonz/lamejs": "^1.2.2",
|
||||
"@jambonz/mw-registrar": "^0.2.7",
|
||||
"@jambonz/realtimedb-helpers": "^0.8.8",
|
||||
"@jambonz/speech-utils": "^0.0.49",
|
||||
"@jambonz/speech-utils": "^0.0.50",
|
||||
"@jambonz/time-series": "^0.2.8",
|
||||
"@jambonz/verb-specifications": "^0.0.69",
|
||||
"@soniox/soniox-node": "^1.2.2",
|
||||
|
||||
@@ -10,7 +10,7 @@ networks:
|
||||
|
||||
services:
|
||||
mysql:
|
||||
platform: linux/x86_64
|
||||
# platform: linux/x86_64
|
||||
image: mysql:5.7
|
||||
ports:
|
||||
- "3360:3306"
|
||||
@@ -36,7 +36,7 @@ services:
|
||||
ipv4_address: 172.58.0.3
|
||||
|
||||
influxdb:
|
||||
platform: linux/x86_64
|
||||
# platform: linux/x86_64
|
||||
image: influxdb:1.8
|
||||
ports:
|
||||
- "8086:8086"
|
||||
|
||||
@@ -536,7 +536,7 @@ test('speech credentials tests', async(t) => {
|
||||
model_id: 'eleven_multilingual_v2'
|
||||
}
|
||||
});
|
||||
t.ok(result.statusCode === 201, 'successfully added speech credential for Cobalt');
|
||||
t.ok(result.statusCode === 201, 'successfully added speech credential for elevenlabs');
|
||||
const elevenlabs_sid = result.body.sid;
|
||||
|
||||
/* delete the credential */
|
||||
@@ -544,7 +544,31 @@ test('speech credentials tests', async(t) => {
|
||||
auth: authUser,
|
||||
resolveWithFullResponse: true,
|
||||
});
|
||||
t.ok(result.statusCode === 204, 'successfully deleted speech credential for Cobalt');
|
||||
t.ok(result.statusCode === 204, 'successfully deleted speech credential for elevenlabs');
|
||||
|
||||
/* add a credential for playht */
|
||||
result = await request.post(`/Accounts/${account_sid}/SpeechCredentials`, {
|
||||
resolveWithFullResponse: true,
|
||||
auth: authUser,
|
||||
json: true,
|
||||
body: {
|
||||
vendor: 'playht',
|
||||
use_for_stt: false,
|
||||
use_for_tts: true,
|
||||
api_key: 'asdasdasdasddsadasda',
|
||||
user_id: 'user_id',
|
||||
voice_engine: 'PlayHT2.0-turbo'
|
||||
}
|
||||
});
|
||||
t.ok(result.statusCode === 201, 'successfully added speech credential for playht');
|
||||
const playht_sid = result.body.sid;
|
||||
|
||||
/* delete the credential */
|
||||
result = await request.delete(`/Accounts/${account_sid}/SpeechCredentials/${playht_sid}`, {
|
||||
auth: authUser,
|
||||
resolveWithFullResponse: true,
|
||||
});
|
||||
t.ok(result.statusCode === 204, 'successfully deleted speech credential for playht');
|
||||
|
||||
|
||||
/* add a credential for custom voices google */
|
||||
|
||||
Reference in New Issue
Block a user