Compare commits

...

10 Commits

Author SHA1 Message Date
Quan HL
a5e423f84f playht should return list of voice match voice engine configured at speech credentials 2024-04-09 16:32:47 +07:00
Hoan Luu Huu
e2c1383723 support mod_playht_tts (#304)
* support mod_playht_tts

* wip

* wip

* wip

* wip

* wip

* update speech utils version
2024-04-08 10:21:29 -04:00
Dave Horton
40de2c5945 option_ping was incorrectly removed, adding back (#305) 2024-04-08 08:56:31 -04:00
Dave Horton
3a299bc3ca update to speech utils with azure 1.36.0 (#303) 2024-04-07 17:45:33 -04:00
Dave Horton
70c9407742 update to speech utils with azure 1.36.0 2024-04-07 12:16:55 -04:00
Dave Horton
dba66d58fc back out column addition of -register_use_tls 2024-04-06 13:48:26 -04:00
Dave Horton
0ff3d22faf Revert "feat send options ping for sip gateway (#273)"
This reverts commit a4792a521f.
2024-04-06 13:27:32 -04:00
Hoan Luu Huu
187a428a75 register use tls (#302) 2024-04-04 08:02:29 -04:00
Hoan Luu Huu
a4792a521f feat send options ping for sip gateway (#273)
* feat send options ping for sip gateway

* update upgrade db script to have 8006
2024-03-30 09:14:29 -04:00
Dave Horton
3ac9693735 update speech-utils with fixes for deepgram production api and tts streaming 2024-03-24 08:15:00 -04:00
12 changed files with 2999 additions and 15744 deletions

View File

@@ -162,7 +162,7 @@ regex VARCHAR(32) NOT NULL COMMENT 'regex-based pattern match against dialed num
description VARCHAR(1024),
priority INTEGER NOT NULL COMMENT 'lower priority routes are attempted first',
PRIMARY KEY (lcr_route_sid)
) COMMENT='An ordered list of digit patterns in an LCR table. The pat';
) COMMENT='An ordered list of digit patterns in an LCR table. The patterns are tested in sequence until one matches';
CREATE TABLE lcr
(
@@ -173,7 +173,7 @@ default_carrier_set_entry_sid CHAR(36) COMMENT 'default carrier/route to use whe
service_provider_sid CHAR(36),
account_sid CHAR(36),
PRIMARY KEY (lcr_sid)
) COMMENT='An LCR (least cost routing) table that is used by a service ';
) COMMENT='An LCR (least cost routing) table that is used by a service provider or account to make decisions about routing outbound calls when multiple carriers are available.';
CREATE TABLE password_settings
(
@@ -458,6 +458,7 @@ inbound BOOLEAN NOT NULL COMMENT 'if true, whitelist this IP to allow inbound ca
outbound BOOLEAN NOT NULL COMMENT 'if true, include in least-cost routing when placing calls to the PSTN',
voip_carrier_sid CHAR(36) NOT NULL,
is_active BOOLEAN NOT NULL DEFAULT 1,
send_options_ping BOOLEAN NOT NULL DEFAULT 0,
pad_crypto BOOLEAN NOT NULL DEFAULT 0,
protocol ENUM('udp','tcp','tls', 'tls/srtp') DEFAULT 'udp' COMMENT 'Outbound call protocol',
PRIMARY KEY (sip_gateway_sid)
@@ -495,7 +496,7 @@ messaging_hook_sid CHAR(36) COMMENT 'webhook to call for inbound SMS/MMS ',
app_json TEXT,
speech_synthesis_vendor VARCHAR(64) NOT NULL DEFAULT 'google',
speech_synthesis_language VARCHAR(12) NOT NULL DEFAULT 'en-US',
speech_synthesis_voice VARCHAR(64),
speech_synthesis_voice VARCHAR(256),
speech_synthesis_label VARCHAR(64),
speech_recognizer_vendor VARCHAR(64) NOT NULL DEFAULT 'google',
speech_recognizer_language VARCHAR(64) NOT NULL DEFAULT 'en-US',
@@ -503,7 +504,7 @@ speech_recognizer_label VARCHAR(64),
use_for_fallback_speech BOOLEAN DEFAULT false,
fallback_speech_synthesis_vendor VARCHAR(64),
fallback_speech_synthesis_language VARCHAR(12),
fallback_speech_synthesis_voice VARCHAR(64),
fallback_speech_synthesis_voice VARCHAR(256),
fallback_speech_synthesis_label VARCHAR(64),
fallback_speech_recognizer_vendor VARCHAR(64),
fallback_speech_recognizer_language VARCHAR(64),

View File

@@ -551,7 +551,7 @@
</location>
<size>
<width>293.00</width>
<height>540.00</height>
<height>560.00</height>
</size>
<zorder>6</zorder>
<SQLField>
@@ -2332,7 +2332,7 @@
</location>
<size>
<width>281.00</width>
<height>240.00</height>
<height>260.00</height>
</size>
<zorder>7</zorder>
<SQLField>
@@ -2399,10 +2399,18 @@
<notNull><![CDATA[1]]></notNull>
<uid><![CDATA[27D4A5BD-8093-4ADD-B5B5-D546844206F9]]></uid>
</SQLField>
<SQLField>
<name><![CDATA[send_options_ping]]></name>
<type><![CDATA[BOOLEAN]]></type>
<defaultValue><![CDATA[0]]></defaultValue>
<notNull><![CDATA[1]]></notNull>
<uid><![CDATA[E04C19A2-12BF-443F-AB61-96990224A18D]]></uid>
</SQLField>
<SQLField>
<name><![CDATA[pad_crypto]]></name>
<type><![CDATA[BOOLEAN]]></type>
<defaultValue><![CDATA[0]]></defaultValue>
<forcedUnique><![CDATA[0]]></forcedUnique>
<notNull><![CDATA[1]]></notNull>
<uid><![CDATA[C5C0043B-100A-4476-BF01-BE0777AE27C0]]></uid>
</SQLField>
@@ -2560,7 +2568,7 @@
</SQLField>
<SQLField>
<name><![CDATA[speech_synthesis_voice]]></name>
<type><![CDATA[VARCHAR(64)]]></type>
<type><![CDATA[VARCHAR(256)]]></type>
<notNull><![CDATA[0]]></notNull>
<uid><![CDATA[929D66F0-64B9-4D7C-AB4B-24F131E1178F]]></uid>
</SQLField>
@@ -2610,7 +2618,7 @@
</SQLField>
<SQLField>
<name><![CDATA[fallback_speech_synthesis_voice]]></name>
<type><![CDATA[VARCHAR(64)]]></type>
<type><![CDATA[VARCHAR(256)]]></type>
<notNull><![CDATA[0]]></notNull>
<uid><![CDATA[6A0E92C9-32B9-4179-A893-3DADF5DD7728]]></uid>
</SQLField>
@@ -3097,17 +3105,17 @@
<overviewPanelHidden><![CDATA[0]]></overviewPanelHidden>
<pageBoundariesVisible><![CDATA[0]]></pageBoundariesVisible>
<PageGridVisible><![CDATA[0]]></PageGridVisible>
<RightSidebarWidth><![CDATA[1924.000000]]></RightSidebarWidth>
<RightSidebarWidth><![CDATA[1235.000000]]></RightSidebarWidth>
<sidebarIndex><![CDATA[2]]></sidebarIndex>
<snapToGrid><![CDATA[0]]></snapToGrid>
<SourceSidebarWidth><![CDATA[0.000000]]></SourceSidebarWidth>
<SourceSidebarWidth><![CDATA[312.000000]]></SourceSidebarWidth>
<SQLEditorFileFormatVersion><![CDATA[4]]></SQLEditorFileFormatVersion>
<uid><![CDATA[58C99A00-06C9-478C-A667-C63842E088F3]]></uid>
<windowHeight><![CDATA[985.000000]]></windowHeight>
<windowLocationX><![CDATA[-1307.000000]]></windowLocationX>
<windowLocationY><![CDATA[1008.000000]]></windowLocationY>
<windowScrollOrigin><![CDATA[{1.5, 786}]]></windowScrollOrigin>
<windowWidth><![CDATA[2201.000000]]></windowWidth>
<windowHeight><![CDATA[870.000000]]></windowHeight>
<windowLocationX><![CDATA[-1164.000000]]></windowLocationX>
<windowLocationY><![CDATA[1131.000000]]></windowLocationY>
<windowScrollOrigin><![CDATA[{0.5, 0}]]></windowScrollOrigin>
<windowWidth><![CDATA[1512.000000]]></windowWidth>
</SQLDocumentInfo>
<AllowsIndexRenamingOnInsert><![CDATA[1]]></AllowsIndexRenamingOnInsert>
<defaultLabelExpanded><![CDATA[1]]></defaultLabelExpanded>

View File

@@ -190,7 +190,12 @@ const sql = {
'ALTER TABLE google_custom_voices ADD FOREIGN KEY speech_credential_sid_idxfk (speech_credential_sid) REFERENCES speech_credentials (speech_credential_sid) ON DELETE CASCADE',
'ALTER TABLE clients ADD COLUMN allow_direct_queue_calling BOOLEAN NOT NULL DEFAULT 1',
'ALTER TABLE clients ADD COLUMN allow_direct_user_calling BOOLEAN NOT NULL DEFAULT 1',
'ALTER TABLE clients ADD COLUMN allow_direct_app_calling BOOLEAN NOT NULL DEFAULT 1'
'ALTER TABLE clients ADD COLUMN allow_direct_app_calling BOOLEAN NOT NULL DEFAULT 1',
],
9000: [
'ALTER TABLE sip_gateways ADD COLUMN send_options_ping BOOLEAN NOT NULL DEFAULT 0',
'ALTER TABLE applications MODIFY COLUMN speech_synthesis_voice VARCHAR(256)',
'ALTER TABLE applications MODIFY COLUMN fallback_speech_synthesis_voice VARCHAR(256)',
]
};
@@ -223,6 +228,7 @@ const doIt = async() => {
if (val < 8003) upgrades.push(...sql['8003']);
if (val < 8004) upgrades.push(...sql['8004']);
if (val < 8005) upgrades.push(...sql['8005']);
if (val < 9000) upgrades.push(...sql['9000']);
// perform all upgrades
logger.info({upgrades}, 'applying schema upgrades..');

View File

@@ -61,6 +61,10 @@ VoipCarrier.fields = [
name: 'requires_register',
type: 'number'
},
{
name: 'register_use_tls',
type: 'number'
},
{
name: 'register_username',
type: 'string'

View File

@@ -6,7 +6,8 @@ const sysError = require('../error');
const {decrypt, encrypt} = require('../../utils/encrypt-decrypt');
const {parseAccountSid, parseServiceProviderSid, parseSpeechCredentialSid} = require('./utils');
const {decryptCredential, testWhisper, testDeepgramTTS,
getLanguagesAndVoicesForVendor} = require('../../utils/speech-utils');
getLanguagesAndVoicesForVendor,
testPlayHT} = require('../../utils/speech-utils');
const {DbErrorUnprocessableRequest, DbErrorForbidden, DbErrorBadRequest} = require('../../utils/errors');
const {
testGoogleTts,
@@ -135,6 +136,8 @@ const encryptCredential = (obj) => {
auth_token = '',
cobalt_server_uri,
model_id,
user_id,
voice_engine,
options
} = obj;
@@ -219,6 +222,13 @@ const encryptCredential = (obj) => {
const elevenlabsData = JSON.stringify({api_key, model_id, options});
return encrypt(elevenlabsData);
case 'playht':
assert(api_key, 'invalid playht speech credential: api_key is required');
assert(user_id, 'invalid playht speech credential: user_id is required');
assert(voice_engine, 'invalid voice_engine speech credential: voice_engine is required');
const playhtData = JSON.stringify({api_key, user_id, voice_engine, options});
return encrypt(playhtData);
case 'assemblyai':
assert(api_key, 'invalid assemblyai speech credential: api_key is required');
const assemblyaiData = JSON.stringify({api_key});
@@ -418,6 +428,7 @@ router.put('/:sid', async(req, res) => {
custom_tts_url,
cobalt_server_uri,
model_id,
voice_engine,
options,
deepgram_stt_uri,
deepgram_stt_use_tls,
@@ -443,6 +454,7 @@ router.put('/:sid', async(req, res) => {
custom_tts_url,
cobalt_server_uri,
model_id,
voice_engine,
options,
deepgram_stt_uri,
deepgram_stt_use_tls,
@@ -724,6 +736,17 @@ router.get('/:sid/test', async(req, res) => {
SpeechCredential.ttsTestResult(sid, false);
}
}
} else if (cred.vendor === 'playht') {
if (cred.use_for_tts) {
try {
await testPlayHT(logger, synthAudio, credential);
results.tts.status = 'ok';
SpeechCredential.ttsTestResult(sid, true);
} catch (err) {
results.tts = {status: 'fail', reason: err.message};
SpeechCredential.ttsTestResult(sid, false);
}
}
} else if (cred.vendor === 'assemblyai') {
const {api_key} = credential;
if (cred.use_for_stt) {

View File

@@ -1108,6 +1108,9 @@ paths:
requires_register:
type: boolean
description: whether this provider requires us to send a REGISTER to them in order to receive calls
register_use_tls:
type: boolean
description: whether this provider requires us to send the REGISTER using the TLS protocol
register_username:
type: string
description: sip username to authenticate with, if registration is required

View File

@@ -0,0 +1,6 @@
// PlayHT voice engine identifiers, exported as {name, value} pairs.
// In every entry the display name and the value are the same string.
// NOTE(review): presumably this is the list loaded as TtsModelPlayHT and
// returned as the model choices for the playht vendor — confirm against caller.
module.exports = [
{ name: 'PlayHT2.0-turbo', value: 'PlayHT2.0-turbo' },
{ name: 'PlayHT2.0', value: 'PlayHT2.0' },
{ name: 'PlayHT1.0', value: 'PlayHT1.0' },
];

View File

@@ -21,6 +21,7 @@ const TtsWhisperLanguagesVoices = require('./speech-data/tts-whisper');
const TtsModelDeepgram = require('./speech-data/tts-model-deepgram');
const TtsModelElevenLabs = require('./speech-data/tts-model-elevenlabs');
const TtsModelWhisper = require('./speech-data/tts-model-whisper');
const TtsModelPlayHT = require('./speech-data/tts-model-playht');
const SttGoogleLanguagesVoices = require('./speech-data/stt-google');
const SttAwsLanguagesVoices = require('./speech-data/stt-aws');
@@ -240,6 +241,27 @@ const testElevenlabs = async(logger, credentials) => {
}
};
/**
 * Verify a PlayHT credential by synthesizing a short fixed phrase.
 *
 * @param {object} logger - logger exposing info(); used only on failure
 * @param {Function} synthAudio - synthesis function, invoked as synthAudio(stats, options)
 * @param {object} credentials - PlayHT credential, passed through to synthAudio untouched
 * @returns {Promise<void>} resolves when synthAudio succeeds
 * @throws rethrows whatever synthAudio rejects with, after logging it
 */
const testPlayHT = async(logger, synthAudio, credentials) => {
  // Stats collector whose methods do nothing: no metrics are recorded for a credential test.
  const noopStats = {
    increment: () => {},
    histogram: () => {}
  };
  const synthOptions = {
    vendor: 'playht',
    credentials,
    language: 'en-US',
    voice: 's3://voice-cloning-zero-shot/d9ff78ba-d016-47f6-b0ef-dd630f59414e/female-cs/manifest.json',
    text: 'Hi there and welcome to jambones!'
  };

  try {
    await synthAudio(noopStats, synthOptions);
  } catch (err) {
    // Log for diagnostics, then let the caller decide how to report the failure.
    logger.info({err}, 'synth Playht returned error');
    throw err;
  }
};
const testWhisper = async(logger, synthAudio, credentials) => {
try {
await synthAudio({increment: () => {}, histogram: () => {}},
@@ -428,6 +450,12 @@ function decryptCredential(obj, credential, logger, isObscureKey = true) {
obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
obj.model_id = o.model_id;
obj.options = o.options;
} else if ('playht' === obj.vendor) {
const o = JSON.parse(decrypt(credential));
obj.api_key = isObscureKey ? obscureKey(o.api_key) : o.api_key;
obj.user_id = o.user_id;
obj.voice_engine = o.voice_engine;
obj.options = o.options;
} else if (obj.vendor.startsWith('custom:')) {
const o = JSON.parse(decrypt(credential));
obj.auth_token = isObscureKey ? obscureKey(o.auth_token) : o.auth_token;
@@ -488,6 +516,8 @@ async function getLanguagesAndVoicesForVendor(logger, vendor, credential, getTts
return await getLanguagesVoicesForSoniox(credential, getTtsVoices, logger);
case 'elevenlabs':
return await getLanguagesVoicesForElevenlabs(credential, getTtsVoices, logger);
case 'playht':
return await getLanguagesVoicesForPlayHT(credential, getTtsVoices, logger);
case 'assemblyai':
return await getLanguagesVoicesForAssemblyAI(credential, getTtsVoices, logger);
case 'whisper':
@@ -645,6 +675,52 @@ async function getLanguagesVoicesForElevenlabs(credential) {
}
}
/**
 * Format an optional attribute as a list fragment.
 *
 * @param {*} a - attribute value; any falsy value is treated as absent
 * @returns {string} ' <a>,' (leading space, trailing comma) or '' when absent
 */
const concat = (a) => {
  if (!a) {
    return '';
  }
  return ` ${a},`;
};
/**
 * Build the PlayHT language/voice list for a speech credential.
 *
 * With a credential, the voice list is requested from the PlayHT API using the
 * credential's api_key and user_id, filtered by the credential's voice_engine
 * (when one is set), and grouped by language code in first-seen order.
 * Without a credential only the static model list is passed on.
 *
 * @param {object} [credential] - PlayHT credential; reads api_key, user_id and voice_engine
 * @returns {Promise<*>} the result of tranform(ttsVoices, undefined, TtsModelPlayHT)
 * @throws rejects with whatever the HTTP request to PlayHT rejects with
 */
async function getLanguagesVoicesForPlayHT(credential) {
  if (!credential) {
    return tranform(undefined, undefined, TtsModelPlayHT);
  }

  const get = bent('https://api.play.ht', 'GET', 'json', {
    'AUTHORIZATION' : credential.api_key,
    'X-USER-ID': credential.user_id,
    'Accept': 'application/json'
  });
  const voices = await get('/api/v2/voices');

  const buildVoice = (d) => {
    // Build the attribute list on a single logical line. The previous multi-line
    // template literal embedded a newline and indentation in every name, which
    // also stopped the trailing comma being stripped when the later attributes
    // were empty.
    const attrs = [d.accent, d.age, d.gender, d.loudness, d.style, d.tempo, d.texture]
      .map((attr) => concat(attr))
      .join('');
    // Each non-empty fragment ends with ',', so a non-empty list has exactly
    // one trailing comma to drop. With no attributes the bare name is used.
    const name = attrs ? `${d.name} -${attrs.slice(0, -1)}` : `${d.name}`;
    return {
      value: `${d.id}`,
      name
    };
  };

  // A credential without a voice_engine would otherwise throw on .includes();
  // in that case no engine filtering is applied.
  const engine = credential.voice_engine;
  // For a string engine, includes() is a substring test, so 'PlayHT2.0-turbo'
  // also admits voices tagged 'PlayHT2.0'.
  // NOTE(review): presumably intentional — confirm.
  const matchesEngine = (voice) => !engine || engine.includes(voice.voice_engine);

  // Map keeps insertion order, so languages appear in first-seen order while
  // avoiding a linear search of the accumulated list for every voice.
  const byLanguage = new Map();
  for (const voice of Array.isArray(voices) ? voices : []) {
    if (!matchesEngine(voice)) continue;
    const languageCode = voice.language_code;
    const existingLanguage = byLanguage.get(languageCode);
    if (existingLanguage) {
      existingLanguage.voices.push(buildVoice(voice));
    } else {
      byLanguage.set(languageCode, {
        value: languageCode,
        name: voice.language,
        voices: [buildVoice(voice)]
      });
    }
  }

  return tranform([...byLanguage.values()], undefined, TtsModelPlayHT);
}
/**
 * Return the AssemblyAI language list.
 *
 * No TTS voice list is supplied (first argument is undefined); only the static
 * SttAssemblyaiLanguagesVoices table is passed to tranform.
 *
 * @param {object} [credential] - accepted but not read by this function
 * @returns {Promise<*>} the result of tranform(undefined, SttAssemblyaiLanguagesVoices)
 */
async function getLanguagesVoicesForAssemblyAI(credential) {
  const ttsVoices = undefined;
  const sttLanguages = SttAssemblyaiLanguagesVoices;
  return tranform(ttsVoices, sttLanguages);
}
@@ -796,6 +872,7 @@ module.exports = {
testIbmStt,
testSonioxStt,
testElevenlabs,
testPlayHT,
testAssemblyStt,
testDeepgramTTS,
getSpeechCredential,

18487
package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@@ -1,6 +1,6 @@
{
"name": "jambonz-api-server",
"version": "0.8.5",
"version": "0.9.0",
"description": "",
"main": "app.js",
"scripts": {
@@ -19,54 +19,54 @@
"url": "https://github.com/jambonz/jambonz-api-server.git"
},
"dependencies": {
"@aws-sdk/client-s3": "^3.363.0",
"@aws-sdk/client-transcribe": "^3.363.0",
"@azure/storage-blob": "^12.15.0",
"@aws-sdk/client-s3": "^3.550.0",
"@aws-sdk/client-transcribe": "^3.549.0",
"@azure/storage-blob": "^12.17.0",
"@deepgram/sdk": "^1.21.0",
"@google-cloud/speech": "^5.2.0",
"@google-cloud/storage": "^6.12.0",
"@google-cloud/speech": "^6.5.0",
"@google-cloud/storage": "^7.9.0",
"@jambonz/db-helpers": "^0.9.3",
"@jambonz/lamejs": "^1.2.2",
"@jambonz/mw-registrar": "^0.2.7",
"@jambonz/realtimedb-helpers": "^0.8.7",
"@jambonz/speech-utils": "^0.0.41",
"@jambonz/realtimedb-helpers": "^0.8.8",
"@jambonz/speech-utils": "^0.0.50",
"@jambonz/time-series": "^0.2.8",
"@jambonz/verb-specifications": "^0.0.45",
"@soniox/soniox-node": "^1.1.1",
"argon2": "^0.30.3",
"assemblyai": "^3.0.1",
"@jambonz/verb-specifications": "^0.0.69",
"@soniox/soniox-node": "^1.2.2",
"argon2": "^0.40.1",
"assemblyai": "^4.3.4",
"bent": "^7.3.12",
"cors": "^2.8.5",
"debug": "^4.3.4",
"express": "^4.18.1",
"express-rate-limit": "^6.4.0",
"form-data": "^2.5.1",
"helmet": "^5.1.0",
"ibm-watson": "^7.1.2",
"jsonwebtoken": "^9.0.0",
"mailgun.js": "^9.1.2",
"microsoft-cognitiveservices-speech-sdk": "1.31.0",
"mysql2": "^2.3.3",
"nocache": "3.0.4",
"passport": "^0.6.0",
"express": "^4.19.2",
"express-rate-limit": "^7.2.0",
"form-data": "^4.0.0",
"helmet": "^7.1.0",
"ibm-watson": "^9.0.1",
"jsonwebtoken": "^9.0.2",
"mailgun.js": "^10.2.1",
"microsoft-cognitiveservices-speech-sdk": "1.36.0",
"mysql2": "^3.9.3",
"nocache": "4.0.0",
"passport": "^0.7.0",
"passport-http-bearer": "^1.0.1",
"pino": "^5.17.0",
"short-uuid": "^4.1.0",
"pino": "^8.20.0",
"short-uuid": "^4.2.2",
"stream-buffers": "^3.0.2",
"stripe": "^8.222.0",
"swagger-ui-express": "^4.4.0",
"uuid": "^8.3.2",
"stripe": "^14.24.0",
"swagger-ui-express": "^5.0.0",
"uuid": "^9.0.1",
"wav": "^1.0.2",
"ws": "^8.12.1",
"ws": "^8.16.0",
"yamljs": "^0.3.0"
},
"devDependencies": {
"eslint": "^8.39.0",
"eslint-plugin-promise": "^6.1.1",
"husky": "7.0.4",
"husky": "9.0.11",
"nyc": "^15.1.0",
"request": "^2.88.2",
"request-promise-native": "^1.0.9",
"tape": "^5.5.3"
"tape": "^5.7.5"
}
}

View File

@@ -10,7 +10,7 @@ networks:
services:
mysql:
platform: linux/x86_64
# platform: linux/x86_64
image: mysql:5.7
ports:
- "3360:3306"
@@ -36,7 +36,7 @@ services:
ipv4_address: 172.58.0.3
influxdb:
platform: linux/x86_64
# platform: linux/x86_64
image: influxdb:1.8
ports:
- "8086:8086"

View File

@@ -536,7 +536,7 @@ test('speech credentials tests', async(t) => {
model_id: 'eleven_multilingual_v2'
}
});
t.ok(result.statusCode === 201, 'successfully added speech credential for Cobalt');
t.ok(result.statusCode === 201, 'successfully added speech credential for elevenlabs');
const elevenlabs_sid = result.body.sid;
/* delete the credential */
@@ -544,7 +544,31 @@ test('speech credentials tests', async(t) => {
auth: authUser,
resolveWithFullResponse: true,
});
t.ok(result.statusCode === 204, 'successfully deleted speech credential for Cobalt');
t.ok(result.statusCode === 204, 'successfully deleted speech credential for elevenlabs');
/* add a credential for playht */
result = await request.post(`/Accounts/${account_sid}/SpeechCredentials`, {
resolveWithFullResponse: true,
auth: authUser,
json: true,
body: {
vendor: 'playht',
use_for_stt: false,
use_for_tts: true,
api_key: 'asdasdasdasddsadasda',
user_id: 'user_id',
voice_engine: 'PlayHT2.0-turbo'
}
});
t.ok(result.statusCode === 201, 'successfully added speech credential for playht');
const playht_sid = result.body.sid;
/* delete the credential */
result = await request.delete(`/Accounts/${account_sid}/SpeechCredentials/${playht_sid}`, {
auth: authUser,
resolveWithFullResponse: true,
});
t.ok(result.statusCode === 204, 'successfully deleted speech credential for playht');
/* add a credential for custom voices google */