support mod_playht_tts (#304)

* support mod_playht_tts

* wip

* wip

* wip

* wip

* wip

* update speech utils version
This commit is contained in:
Hoan Luu Huu
2024-04-08 21:21:29 +07:00
committed by GitHub
parent 40de2c5945
commit e2c1383723
10 changed files with 146 additions and 17 deletions

View File

@@ -162,7 +162,7 @@ regex VARCHAR(32) NOT NULL COMMENT 'regex-based pattern match against dialed num
description VARCHAR(1024), description VARCHAR(1024),
priority INTEGER NOT NULL COMMENT 'lower priority routes are attempted first', priority INTEGER NOT NULL COMMENT 'lower priority routes are attempted first',
PRIMARY KEY (lcr_route_sid) PRIMARY KEY (lcr_route_sid)
) COMMENT='An ordered list of digit patterns in an LCR table. The pat'; ) COMMENT='An ordered list of digit patterns in an LCR table. The patterns are tested in sequence until one matches';
CREATE TABLE lcr CREATE TABLE lcr
( (
@@ -173,7 +173,7 @@ default_carrier_set_entry_sid CHAR(36) COMMENT 'default carrier/route to use whe
service_provider_sid CHAR(36), service_provider_sid CHAR(36),
account_sid CHAR(36), account_sid CHAR(36),
PRIMARY KEY (lcr_sid) PRIMARY KEY (lcr_sid)
) COMMENT='An LCR (least cost routing) table that is used by a service '; ) COMMENT='An LCR (least cost routing) table that is used by a service provider or account to make decisions about routing outbound calls when multiple carriers are available.';
CREATE TABLE password_settings CREATE TABLE password_settings
( (
@@ -496,7 +496,7 @@ messaging_hook_sid CHAR(36) COMMENT 'webhook to call for inbound SMS/MMS ',
app_json TEXT, app_json TEXT,
speech_synthesis_vendor VARCHAR(64) NOT NULL DEFAULT 'google', speech_synthesis_vendor VARCHAR(64) NOT NULL DEFAULT 'google',
speech_synthesis_language VARCHAR(12) NOT NULL DEFAULT 'en-US', speech_synthesis_language VARCHAR(12) NOT NULL DEFAULT 'en-US',
speech_synthesis_voice VARCHAR(64), speech_synthesis_voice VARCHAR(256),
speech_synthesis_label VARCHAR(64), speech_synthesis_label VARCHAR(64),
speech_recognizer_vendor VARCHAR(64) NOT NULL DEFAULT 'google', speech_recognizer_vendor VARCHAR(64) NOT NULL DEFAULT 'google',
speech_recognizer_language VARCHAR(64) NOT NULL DEFAULT 'en-US', speech_recognizer_language VARCHAR(64) NOT NULL DEFAULT 'en-US',
@@ -504,7 +504,7 @@ speech_recognizer_label VARCHAR(64),
use_for_fallback_speech BOOLEAN DEFAULT false, use_for_fallback_speech BOOLEAN DEFAULT false,
fallback_speech_synthesis_vendor VARCHAR(64), fallback_speech_synthesis_vendor VARCHAR(64),
fallback_speech_synthesis_language VARCHAR(12), fallback_speech_synthesis_language VARCHAR(12),
fallback_speech_synthesis_voice VARCHAR(64), fallback_speech_synthesis_voice VARCHAR(256),
fallback_speech_synthesis_label VARCHAR(64), fallback_speech_synthesis_label VARCHAR(64),
fallback_speech_recognizer_vendor VARCHAR(64), fallback_speech_recognizer_vendor VARCHAR(64),
fallback_speech_recognizer_language VARCHAR(64), fallback_speech_recognizer_language VARCHAR(64),

View File

@@ -2568,7 +2568,7 @@
</SQLField> </SQLField>
<SQLField> <SQLField>
<name><![CDATA[speech_synthesis_voice]]></name> <name><![CDATA[speech_synthesis_voice]]></name>
<type><![CDATA[VARCHAR(64)]]></type> <type><![CDATA[VARCHAR(256)]]></type>
<notNull><![CDATA[0]]></notNull> <notNull><![CDATA[0]]></notNull>
<uid><![CDATA[929D66F0-64B9-4D7C-AB4B-24F131E1178F]]></uid> <uid><![CDATA[929D66F0-64B9-4D7C-AB4B-24F131E1178F]]></uid>
</SQLField> </SQLField>
@@ -2618,7 +2618,7 @@
</SQLField> </SQLField>
<SQLField> <SQLField>
<name><![CDATA[fallback_speech_synthesis_voice]]></name> <name><![CDATA[fallback_speech_synthesis_voice]]></name>
<type><![CDATA[VARCHAR(64)]]></type> <type><![CDATA[VARCHAR(256)]]></type>
<notNull><![CDATA[0]]></notNull> <notNull><![CDATA[0]]></notNull>
<uid><![CDATA[6A0E92C9-32B9-4179-A893-3DADF5DD7728]]></uid> <uid><![CDATA[6A0E92C9-32B9-4179-A893-3DADF5DD7728]]></uid>
</SQLField> </SQLField>
@@ -3108,7 +3108,7 @@
<RightSidebarWidth><![CDATA[1235.000000]]></RightSidebarWidth> <RightSidebarWidth><![CDATA[1235.000000]]></RightSidebarWidth>
<sidebarIndex><![CDATA[2]]></sidebarIndex> <sidebarIndex><![CDATA[2]]></sidebarIndex>
<snapToGrid><![CDATA[0]]></snapToGrid> <snapToGrid><![CDATA[0]]></snapToGrid>
<SourceSidebarWidth><![CDATA[0.000000]]></SourceSidebarWidth> <SourceSidebarWidth><![CDATA[312.000000]]></SourceSidebarWidth>
<SQLEditorFileFormatVersion><![CDATA[4]]></SQLEditorFileFormatVersion> <SQLEditorFileFormatVersion><![CDATA[4]]></SQLEditorFileFormatVersion>
<uid><![CDATA[58C99A00-06C9-478C-A667-C63842E088F3]]></uid> <uid><![CDATA[58C99A00-06C9-478C-A667-C63842E088F3]]></uid>
<windowHeight><![CDATA[870.000000]]></windowHeight> <windowHeight><![CDATA[870.000000]]></windowHeight>

View File

@@ -194,6 +194,8 @@ const sql = {
], ],
9000: [ 9000: [
'ALTER TABLE sip_gateways ADD COLUMN send_options_ping BOOLEAN NOT NULL DEFAULT 0', 'ALTER TABLE sip_gateways ADD COLUMN send_options_ping BOOLEAN NOT NULL DEFAULT 0',
'ALTER TABLE applications MODIFY COLUMN speech_synthesis_voice VARCHAR(256)',
'ALTER TABLE applications MODIFY COLUMN fallback_speech_synthesis_voice VARCHAR(256)',
] ]
}; };

View File

@@ -6,7 +6,8 @@ const sysError = require('../error');
const {decrypt, encrypt} = require('../../utils/encrypt-decrypt'); const {decrypt, encrypt} = require('../../utils/encrypt-decrypt');
const {parseAccountSid, parseServiceProviderSid, parseSpeechCredentialSid} = require('./utils'); const {parseAccountSid, parseServiceProviderSid, parseSpeechCredentialSid} = require('./utils');
const {decryptCredential, testWhisper, testDeepgramTTS, const {decryptCredential, testWhisper, testDeepgramTTS,
getLanguagesAndVoicesForVendor} = require('../../utils/speech-utils'); getLanguagesAndVoicesForVendor,
testPlayHT} = require('../../utils/speech-utils');
const {DbErrorUnprocessableRequest, DbErrorForbidden, DbErrorBadRequest} = require('../../utils/errors'); const {DbErrorUnprocessableRequest, DbErrorForbidden, DbErrorBadRequest} = require('../../utils/errors');
const { const {
testGoogleTts, testGoogleTts,
@@ -135,6 +136,8 @@ const encryptCredential = (obj) => {
auth_token = '', auth_token = '',
cobalt_server_uri, cobalt_server_uri,
model_id, model_id,
user_id,
voice_engine,
options options
} = obj; } = obj;
@@ -219,6 +222,13 @@ const encryptCredential = (obj) => {
const elevenlabsData = JSON.stringify({api_key, model_id, options}); const elevenlabsData = JSON.stringify({api_key, model_id, options});
return encrypt(elevenlabsData); return encrypt(elevenlabsData);
case 'playht':
assert(api_key, 'invalid playht speech credential: api_key is required');
assert(user_id, 'invalid playht speech credential: user_id is required');
assert(voice_engine, 'invalid voice_engine speech credential: voice_engine is required');
const playhtData = JSON.stringify({api_key, user_id, voice_engine, options});
return encrypt(playhtData);
case 'assemblyai': case 'assemblyai':
assert(api_key, 'invalid assemblyai speech credential: api_key is required'); assert(api_key, 'invalid assemblyai speech credential: api_key is required');
const assemblyaiData = JSON.stringify({api_key}); const assemblyaiData = JSON.stringify({api_key});
@@ -418,6 +428,7 @@ router.put('/:sid', async(req, res) => {
custom_tts_url, custom_tts_url,
cobalt_server_uri, cobalt_server_uri,
model_id, model_id,
voice_engine,
options, options,
deepgram_stt_uri, deepgram_stt_uri,
deepgram_stt_use_tls, deepgram_stt_use_tls,
@@ -443,6 +454,7 @@ router.put('/:sid', async(req, res) => {
custom_tts_url, custom_tts_url,
cobalt_server_uri, cobalt_server_uri,
model_id, model_id,
voice_engine,
options, options,
deepgram_stt_uri, deepgram_stt_uri,
deepgram_stt_use_tls, deepgram_stt_use_tls,
@@ -724,6 +736,17 @@ router.get('/:sid/test', async(req, res) => {
SpeechCredential.ttsTestResult(sid, false); SpeechCredential.ttsTestResult(sid, false);
} }
} }
} else if (cred.vendor === 'playht') {
if (cred.use_for_tts) {
try {
await testPlayHT(logger, synthAudio, credential);
results.tts.status = 'ok';
SpeechCredential.ttsTestResult(sid, true);
} catch (err) {
results.tts = {status: 'fail', reason: err.message};
SpeechCredential.ttsTestResult(sid, false);
}
}
} else if (cred.vendor === 'assemblyai') { } else if (cred.vendor === 'assemblyai') {
const {api_key} = credential; const {api_key} = credential;
if (cred.use_for_stt) { if (cred.use_for_stt) {

View File

@@ -0,0 +1,6 @@
// Voice engine identifiers offered for PlayHT text-to-speech.
// Each entry uses the engine identifier as both its display name and its value.
const PLAYHT_VOICE_ENGINES = ['PlayHT2.0-turbo', 'PlayHT2.0', 'PlayHT1.0'];

module.exports = PLAYHT_VOICE_ENGINES.map((engine) => ({ name: engine, value: engine }));

View File

@@ -21,6 +21,7 @@ const TtsWhisperLanguagesVoices = require('./speech-data/tts-whisper');
const TtsModelDeepgram = require('./speech-data/tts-model-deepgram'); const TtsModelDeepgram = require('./speech-data/tts-model-deepgram');
const TtsModelElevenLabs = require('./speech-data/tts-model-elevenlabs'); const TtsModelElevenLabs = require('./speech-data/tts-model-elevenlabs');
const TtsModelWhisper = require('./speech-data/tts-model-whisper'); const TtsModelWhisper = require('./speech-data/tts-model-whisper');
const TtsModelPlayHT = require('./speech-data/tts-model-playht');
const SttGoogleLanguagesVoices = require('./speech-data/stt-google'); const SttGoogleLanguagesVoices = require('./speech-data/stt-google');
const SttAwsLanguagesVoices = require('./speech-data/stt-aws'); const SttAwsLanguagesVoices = require('./speech-data/stt-aws');
@@ -240,6 +241,27 @@ const testElevenlabs = async(logger, credentials) => {
} }
}; };
/**
 * Check a PlayHT credential by attempting a single short speech synthesis.
 *
 * @param {object} logger - logger exposing `info(obj, msg)`; used only on failure
 * @param {Function} synthAudio - synthesis function, invoked as `synthAudio(stats, options)`
 * @param {object} credentials - PlayHT credential, forwarded untouched to `synthAudio`
 * @returns {Promise<void>} resolves when `synthAudio` completes without throwing
 * @throws rethrows whatever `synthAudio` throws, after logging it at info level
 */
const testPlayHT = async(logger, synthAudio, credentials) => {
  // First argument is a stub with no-op increment/histogram methods
  // (presumably stands in for a metrics client - confirm against synthAudio).
  const noopStats = {
    increment: () => {},
    histogram: () => {}
  };
  // Fixed probe request: a hard-coded voice and a short English sentence.
  const probe = {
    vendor: 'playht',
    credentials,
    language: 'en-US',
    voice: 's3://voice-cloning-zero-shot/d9ff78ba-d016-47f6-b0ef-dd630f59414e/female-cs/manifest.json',
    text: 'Hi there and welcome to jambones!'
  };

  try {
    await synthAudio(noopStats, probe);
  } catch (err) {
    logger.info({err}, 'synth Playht returned error');
    throw err;
  }
};
const testWhisper = async(logger, synthAudio, credentials) => { const testWhisper = async(logger, synthAudio, credentials) => {
try { try {
await synthAudio({increment: () => {}, histogram: () => {}}, await synthAudio({increment: () => {}, histogram: () => {}},
@@ -428,6 +450,12 @@ function decryptCredential(obj, credential, logger, isObscureKey = true) {
obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key; obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
obj.model_id = o.model_id; obj.model_id = o.model_id;
obj.options = o.options; obj.options = o.options;
} else if ('playht' === obj.vendor) {
const o = JSON.parse(decrypt(credential));
obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
obj.user_id = o.user_id;
obj.voice_engine = o.voice_engine;
obj.options = o.options;
} else if (obj.vendor.startsWith('custom:')) { } else if (obj.vendor.startsWith('custom:')) {
const o = JSON.parse(decrypt(credential)); const o = JSON.parse(decrypt(credential));
obj.auth_token = isObscureKey ? obscureKey(o.auth_token) : o.auth_token; obj.auth_token = isObscureKey ? obscureKey(o.auth_token) : o.auth_token;
@@ -488,6 +516,8 @@ async function getLanguagesAndVoicesForVendor(logger, vendor, credential, getTts
return await getLanguagesVoicesForSoniox(credential, getTtsVoices, logger); return await getLanguagesVoicesForSoniox(credential, getTtsVoices, logger);
case 'elevenlabs': case 'elevenlabs':
return await getLanguagesVoicesForElevenlabs(credential, getTtsVoices, logger); return await getLanguagesVoicesForElevenlabs(credential, getTtsVoices, logger);
case 'playht':
return await getLanguagesVoicesForPlayHT(credential, getTtsVoices, logger);
case 'assemblyai': case 'assemblyai':
return await getLanguagesVoicesForAssemblyAI(credential, getTtsVoices, logger); return await getLanguagesVoicesForAssemblyAI(credential, getTtsVoices, logger);
case 'whisper': case 'whisper':
@@ -645,6 +675,49 @@ async function getLanguagesVoicesForElevenlabs(credential) {
} }
} }
/**
 * Format one optional attribute as a fragment of a comma-separated list.
 *
 * @param {*} a - attribute value
 * @returns {string} ` ${a},` (leading space, trailing comma) when `a` is truthy,
 *   otherwise the empty string
 */
const concat = (a) => {
  if (!a) {
    return '';
  }
  return ` ${a},`;
};
/**
 * Build the language/voice listing for the PlayHT TTS vendor.
 *
 * Without a credential, only the static PlayHT model list is passed to
 * `tranform`. With a credential, the voices are fetched from
 * `GET https://api.play.ht/api/v2/voices` (authenticated with the
 * credential's `api_key` and `user_id`) and grouped by `language_code`,
 * keeping languages and voices in the order the API returned them.
 *
 * @param {object} [credential] - decrypted PlayHT credential
 * @param {string} credential.api_key - sent as the AUTHORIZATION header
 * @param {string} credential.user_id - sent as the X-USER-ID header
 * @returns {Promise<*>} whatever `tranform(ttsVoices, undefined, TtsModelPlayHT)` returns
 * @throws propagates any error raised by the HTTP request
 */
async function getLanguagesVoicesForPlayHT(credential) {
  if (!credential) {
    return tranform(undefined, undefined, TtsModelPlayHT);
  }

  const get = bent('https://api.play.ht', 'GET', 'json', {
    'AUTHORIZATION' : credential.api_key,
    'X-USER-ID': credential.user_id,
    'Accept': 'application/json'
  });
  const voices = await get('/api/v2/voices');

  // Label is "<name> - attr1, attr2, ..." using only the attributes present.
  // The fragments are joined in code rather than inside a multi-line template
  // literal, which would embed a raw newline in the label and leave a
  // trailing comma (or a dangling " -") when the later attributes are missing.
  const buildVoice = (d) => {
    const details = [d.accent, d.age, d.gender, d.loudness, d.style, d.tempo, d.texture]
      .map((attr) => concat(attr))
      .join('');
    // every non-empty fragment ends with ',', so a non-empty list always does too
    const name = details ? `${d.name} -${details.slice(0, -1)}` : `${d.name}`;
    return {
      value: `${d.id}`,
      name
    };
  };

  // Group by language code with a Map: one lookup per voice instead of a
  // linear scan of the accumulated languages; insertion order is preserved.
  const byLanguage = new Map();
  for (const voice of voices) {
    const languageCode = voice.language_code;
    let entry = byLanguage.get(languageCode);
    if (!entry) {
      entry = {
        value: languageCode,
        name: voice.language,
        voices: []
      };
      byLanguage.set(languageCode, entry);
    }
    entry.voices.push(buildVoice(voice));
  }

  return tranform([...byLanguage.values()], undefined, TtsModelPlayHT);
}
async function getLanguagesVoicesForAssemblyAI(credential) { async function getLanguagesVoicesForAssemblyAI(credential) {
return tranform(undefined, SttAssemblyaiLanguagesVoices); return tranform(undefined, SttAssemblyaiLanguagesVoices);
} }
@@ -796,6 +869,7 @@ module.exports = {
testIbmStt, testIbmStt,
testSonioxStt, testSonioxStt,
testElevenlabs, testElevenlabs,
testPlayHT,
testAssemblyStt, testAssemblyStt,
testDeepgramTTS, testDeepgramTTS,
getSpeechCredential, getSpeechCredential,

8
package-lock.json generated
View File

@@ -19,7 +19,7 @@
"@jambonz/lamejs": "^1.2.2", "@jambonz/lamejs": "^1.2.2",
"@jambonz/mw-registrar": "^0.2.7", "@jambonz/mw-registrar": "^0.2.7",
"@jambonz/realtimedb-helpers": "^0.8.8", "@jambonz/realtimedb-helpers": "^0.8.8",
"@jambonz/speech-utils": "^0.0.49", "@jambonz/speech-utils": "^0.0.50",
"@jambonz/time-series": "^0.2.8", "@jambonz/time-series": "^0.2.8",
"@jambonz/verb-specifications": "^0.0.69", "@jambonz/verb-specifications": "^0.0.69",
"@soniox/soniox-node": "^1.2.2", "@soniox/soniox-node": "^1.2.2",
@@ -2027,9 +2027,9 @@
} }
}, },
"node_modules/@jambonz/speech-utils": { "node_modules/@jambonz/speech-utils": {
"version": "0.0.49", "version": "0.0.50",
"resolved": "https://registry.npmjs.org/@jambonz/speech-utils/-/speech-utils-0.0.49.tgz", "resolved": "https://registry.npmjs.org/@jambonz/speech-utils/-/speech-utils-0.0.50.tgz",
"integrity": "sha512-hIVdgiPJJN2WYm7qlcP8yZv+1w+z38sCmiFNf9xMAAV6pjrDCEeUjrwpLhaFkWqVmkNrHh9PHuzPlkLDNkzhIA==", "integrity": "sha512-fcMaOuWrBVFh6FKiiurYhnQV71xXmnkyBQmp4OjNd1Zo8Ya+tZMdAJjyHtimjJdgiwJbwDnfdSwKSuz8G9CVkQ==",
"dependencies": { "dependencies": {
"@aws-sdk/client-polly": "^3.496.0", "@aws-sdk/client-polly": "^3.496.0",
"@aws-sdk/client-sts": "^3.496.0", "@aws-sdk/client-sts": "^3.496.0",

View File

@@ -29,7 +29,7 @@
"@jambonz/lamejs": "^1.2.2", "@jambonz/lamejs": "^1.2.2",
"@jambonz/mw-registrar": "^0.2.7", "@jambonz/mw-registrar": "^0.2.7",
"@jambonz/realtimedb-helpers": "^0.8.8", "@jambonz/realtimedb-helpers": "^0.8.8",
"@jambonz/speech-utils": "^0.0.49", "@jambonz/speech-utils": "^0.0.50",
"@jambonz/time-series": "^0.2.8", "@jambonz/time-series": "^0.2.8",
"@jambonz/verb-specifications": "^0.0.69", "@jambonz/verb-specifications": "^0.0.69",
"@soniox/soniox-node": "^1.2.2", "@soniox/soniox-node": "^1.2.2",

View File

@@ -10,7 +10,7 @@ networks:
services: services:
mysql: mysql:
platform: linux/x86_64 # platform: linux/x86_64
image: mysql:5.7 image: mysql:5.7
ports: ports:
- "3360:3306" - "3360:3306"
@@ -36,7 +36,7 @@ services:
ipv4_address: 172.58.0.3 ipv4_address: 172.58.0.3
influxdb: influxdb:
platform: linux/x86_64 # platform: linux/x86_64
image: influxdb:1.8 image: influxdb:1.8
ports: ports:
- "8086:8086" - "8086:8086"

View File

@@ -536,7 +536,7 @@ test('speech credentials tests', async(t) => {
model_id: 'eleven_multilingual_v2' model_id: 'eleven_multilingual_v2'
} }
}); });
t.ok(result.statusCode === 201, 'successfully added speech credential for Cobalt'); t.ok(result.statusCode === 201, 'successfully added speech credential for elevenlabs');
const elevenlabs_sid = result.body.sid; const elevenlabs_sid = result.body.sid;
/* delete the credential */ /* delete the credential */
@@ -544,7 +544,31 @@ test('speech credentials tests', async(t) => {
auth: authUser, auth: authUser,
resolveWithFullResponse: true, resolveWithFullResponse: true,
}); });
t.ok(result.statusCode === 204, 'successfully deleted speech credential for Cobalt'); t.ok(result.statusCode === 204, 'successfully deleted speech credential for elevenlabs');
/* add a credential for playht */
result = await request.post(`/Accounts/${account_sid}/SpeechCredentials`, {
resolveWithFullResponse: true,
auth: authUser,
json: true,
body: {
vendor: 'playht',
use_for_stt: false,
use_for_tts: true,
api_key: 'asdasdasdasddsadasda',
user_id: 'user_id',
voice_engine: 'PlayHT2.0-turbo'
}
});
t.ok(result.statusCode === 201, 'successfully added speech credential for playht');
const playht_sid = result.body.sid;
/* delete the credential */
result = await request.delete(`/Accounts/${account_sid}/SpeechCredentials/${playht_sid}`, {
auth: authUser,
resolveWithFullResponse: true,
});
t.ok(result.statusCode === 204, 'successfully deleted speech credential for playht');
/* add a credential for custom voices google */ /* add a credential for custom voices google */