Tts/elevenlabs streaming (#629)

* update to fsmrf with fix

* changes to support elevenlabs tts streaming

* say: add vendor data to span

* bug: tts spans must include cached property

* add env for JAMBONES_USE_FREESWITCH_TIMER_FD

* fix bug in prev commit

* wip

* linting

* wip - caching files generated by streaming tts

* wip caching

* cleanup some logs

* handle tts streaming failure, write alert

* update node version dependency

* set timerfd on outbound call scenarios

* default model to nova-2-phonecall when using deepgram

---------

Co-authored-by: Dave Horton <daveh@beachdognet.com>
This commit is contained in:
Hoan Luu Huu
2024-02-07 20:49:36 +07:00
committed by GitHub
parent 48a81072e8
commit a55f81676b
10 changed files with 2932 additions and 4138 deletions

View File

@@ -132,6 +132,8 @@ const JAMBONES_DISABLE_DIRECT_P2P_CALL = process.env.JAMBONES_DISABLE_DIRECT_P2P
const JAMBONES_EAGERLY_PRE_CACHE_AUDIO = process.env.JAMBONES_EAGERLY_PRE_CACHE_AUDIO; const JAMBONES_EAGERLY_PRE_CACHE_AUDIO = process.env.JAMBONES_EAGERLY_PRE_CACHE_AUDIO;
const JAMBONES_USE_FREESWITCH_TIMER_FD = process.env.JAMBONES_USE_FREESWITCH_TIMER_FD;
module.exports = { module.exports = {
JAMBONES_MYSQL_HOST, JAMBONES_MYSQL_HOST,
JAMBONES_MYSQL_USER, JAMBONES_MYSQL_USER,
@@ -213,5 +215,6 @@ module.exports = {
JAMBONZ_RECORD_WS_USERNAME, JAMBONZ_RECORD_WS_USERNAME,
JAMBONZ_RECORD_WS_PASSWORD, JAMBONZ_RECORD_WS_PASSWORD,
JAMBONZ_DISABLE_DIAL_PAI_HEADER, JAMBONZ_DISABLE_DIAL_PAI_HEADER,
JAMBONES_DISABLE_DIRECT_P2P_CALL JAMBONES_DISABLE_DIRECT_P2P_CALL,
JAMBONES_USE_FREESWITCH_TIMER_FD
}; };

View File

@@ -21,6 +21,7 @@ const {
JAMBONES_INJECT_CONTENT, JAMBONES_INJECT_CONTENT,
JAMBONES_EAGERLY_PRE_CACHE_AUDIO, JAMBONES_EAGERLY_PRE_CACHE_AUDIO,
AWS_REGION, AWS_REGION,
JAMBONES_USE_FREESWITCH_TIMER_FD
} = require('../config'); } = require('../config');
const BackgroundTaskManager = require('../utils/background-task-manager'); const BackgroundTaskManager = require('../utils/background-task-manager');
const BADPRECONDITIONS = 'preconditions not met'; const BADPRECONDITIONS = 'preconditions not met';
@@ -689,7 +690,7 @@ class CallSession extends Emitter {
(type === 'stt' && credential.use_for_stt) (type === 'stt' && credential.use_for_stt)
)) { )) {
this.logger.info( this.logger.info(
`Speech vendor: ${credential.vendor} ${credential.label ? `, label: ${credential.label}` : ''} selected`); `${type}: ${credential.vendor} ${credential.label ? `, label: ${credential.label}` : ''} `);
if ('google' === vendor) { if ('google' === vendor) {
if (type === 'tts' && !credential.tts_tested_ok || if (type === 'tts' && !credential.tts_tested_ok ||
type === 'stt' && !credential.stt_tested_ok) { type === 'stt' && !credential.stt_tested_ok) {
@@ -2033,8 +2034,12 @@ Duration=${duration} `
} }
_configMsEndpoint() { _configMsEndpoint() {
if (this.onHoldMusic) { const opts = {
this.ep.set({hold_music: `shout://${this.onHoldMusic.replace(/^https?:\/\//, '')}`}); ...(this.onHoldMusic && {holdMusic: `shout://${this.onHoldMusic.replace(/^https?:\/\//, '')}`}),
...(JAMBONES_USE_FREESWITCH_TIMER_FD && {timer_name: 'timerfd'})
};
if (Object.keys(opts).length > 0) {
this.ep.set(opts);
} }
} }

View File

@@ -123,8 +123,6 @@ class TaskListen extends Task {
ci, ci,
this.metadata); this.metadata);
if (this.hook.auth) { if (this.hook.auth) {
this.logger.debug({username: this.hook.auth.username, password: this.hook.auth.password},
'TaskListen:_startListening basic auth');
await this.ep.set({ await this.ep.set({
'MOD_AUDIO_BASIC_AUTH_USERNAME': this.hook.auth.username, 'MOD_AUDIO_BASIC_AUTH_USERNAME': this.hook.auth.username,
'MOD_AUDIO_BASIC_AUTH_PASSWORD': this.hook.auth.password 'MOD_AUDIO_BASIC_AUTH_PASSWORD': this.hook.auth.password

View File

@@ -23,6 +23,12 @@ const breakLengthyTextIfNeeded = (logger, text) => {
} }
}; };
/**
 * Return the portion of a say string that follows its first closing brace.
 *
 * Presumably the input looks like `say:{param=value,...}text to speak`
 * (the streaming path elsewhere in this file builds `say:{...` strings) —
 * confirm against the caller. When the string contains no `}` it is
 * returned unchanged.
 *
 * @param {string} text - the say string, or plain text
 * @returns {string} everything after the first `}`, or `text` itself
 */
const parseTextFromSayString = (text) => {
  const braceAt = text.indexOf('}');
  return braceAt === -1 ? text : text.slice(braceAt + 1);
};
class TaskSay extends Task { class TaskSay extends Task {
constructor(logger, opts, parentTask) { constructor(logger, opts, parentTask) {
super(logger, opts); super(logger, opts);
@@ -60,7 +66,7 @@ class TaskSay extends Task {
} }
async _synthesizeWithSpecificVendor(cs, ep, {vendor, language, voice, label, preCache = false}) { async _synthesizeWithSpecificVendor(cs, ep, {vendor, language, voice, label, preCache = false}) {
const {srf} = cs; const {srf, accountSid:account_sid} = cs;
const {updateSpeechCredentialLastUsed} = require('../utils/db-utils')(this.logger, srf); const {updateSpeechCredentialLastUsed} = require('../utils/db-utils')(this.logger, srf);
const {writeAlerts, AlertType, stats} = srf.locals; const {writeAlerts, AlertType, stats} = srf.locals;
const {synthAudio} = srf.locals.dbHelpers; const {synthAudio} = srf.locals.dbHelpers;
@@ -97,11 +103,17 @@ class TaskSay extends Task {
voice = this.options.voice_id || voice; voice = this.options.voice_id || voice;
} }
this.ep.set({
tts_engine: vendor,
tts_voice: voice,
cache_speech_handles: 1,
}).catch((err) => this.logger.info({err}, 'Error setting tts_engine on endpoint'));
if (!preCache) this.logger.info({vendor, language, voice, model}, 'TaskSay:exec'); if (!preCache) this.logger.info({vendor, language, voice, model}, 'TaskSay:exec');
try { try {
if (!credentials) { if (!credentials) {
writeAlerts({ writeAlerts({
account_sid: cs.accountSid, account_sid,
alert_type: AlertType.TTS_NOT_PROVISIONED, alert_type: AlertType.TTS_NOT_PROVISIONED,
vendor vendor
}).catch((err) => this.logger.info({err}, 'Error generating alert for no tts')); }).catch((err) => this.logger.info({err}, 'Error generating alert for no tts'));
@@ -120,18 +132,17 @@ class TaskSay extends Task {
if (text.startsWith('silence_stream://')) return text; if (text.startsWith('silence_stream://')) return text;
/* otel: trace time for tts */ /* otel: trace time for tts */
let otelSpan;
if (!preCache) { if (!preCache) {
const {span} = this.startChildSpan('tts-generation', { const {span} = this.startChildSpan('tts-generation', {
'tts.vendor': vendor, 'tts.vendor': vendor,
'tts.language': language, 'tts.language': language,
'tts.voice': voice 'tts.voice': voice
}); });
otelSpan = span; this.otelSpan = span;
} }
try { try {
const {filePath, servedFromCache, rtt} = await synthAudio(stats, { const {filePath, servedFromCache, rtt} = await synthAudio(stats, {
account_sid: cs.accountSid, account_sid,
text, text,
vendor, vendor,
language, language,
@@ -141,30 +152,40 @@ class TaskSay extends Task {
salt, salt,
credentials, credentials,
options: this.options, options: this.options,
disableTtsCache : this.disableTtsCache disableTtsCache : this.disableTtsCache,
preCache
}); });
this.logger.debug(`file ${filePath}, served from cache ${servedFromCache}`); if (!filePath.startsWith('say:')) {
if (filePath) cs.trackTmpFile(filePath); this.logger.debug(`file ${filePath}, served from cache ${servedFromCache}`);
if (!servedFromCache && !lastUpdated) { if (filePath) cs.trackTmpFile(filePath);
lastUpdated = true; if (this.otelSpan) {
updateSpeechCredentialLastUsed(credentials.speech_credential_sid) this.otelSpan.setAttributes({'tts.cached': servedFromCache});
.catch(() => {/*already logged error */}); this.otelSpan.end();
this.otelSpan = null;
}
if (!servedFromCache && !lastUpdated) {
lastUpdated = true;
updateSpeechCredentialLastUsed(credentials.speech_credential_sid).catch(() => {/* logged error */});
}
if (!servedFromCache && rtt && !preCache) {
this.notifyStatus({
event: 'synthesized-audio',
vendor,
language,
characters: text.length,
elapsedTime: rtt
});
}
} }
if (otelSpan) otelSpan.setAttributes({'tts.cached': servedFromCache}); else {
if (otelSpan) otelSpan.end(); this.logger.debug('a streaming tts api will be used');
if (!servedFromCache && rtt && !preCache) { const modifiedPath = filePath.replace('say:{', `say:{session-uuid=${this.ep.uuid},`);
this.notifyStatus({ return modifiedPath;
event: 'synthesized-audio',
vendor,
language,
characters: text.length,
elapsedTime: rtt
});
} }
return filePath; return filePath;
} catch (err) { } catch (err) {
this.logger.info({err}, 'Error synthesizing tts'); this.logger.info({err}, 'Error synthesizing tts');
if (otelSpan) otelSpan.end(); if (this.otelSpan) this.otelSpan.end();
writeAlerts({ writeAlerts({
account_sid: cs.accountSid, account_sid: cs.accountSid,
alert_type: AlertType.TTS_FAILURE, alert_type: AlertType.TTS_FAILURE,
@@ -186,6 +207,11 @@ class TaskSay extends Task {
} }
async exec(cs, {ep}) { async exec(cs, {ep}) {
const {srf, accountSid:account_sid} = cs;
const {writeAlerts, AlertType} = srf.locals;
const {addFileToCache} = srf.locals.dbHelpers;
const engine = this.synthesizer.engine || 'standard';
await super.exec(cs); await super.exec(cs);
this.ep = ep; this.ep = ep;
@@ -243,7 +269,39 @@ class TaskSay extends Task {
await this.playToConfMember(this.ep, memberId, confName, confUuid, filepath[segment]); await this.playToConfMember(this.ep, memberId, confName, confUuid, filepath[segment]);
} }
else { else {
this.logger.debug(`Say:exec sending command to play file ${filepath[segment]}`); this.logger.debug(`Say:exec sending ${filepath[segment].substring(0, 64)}`);
this.ep.once('playback-start', (evt) => {
this.logger.debug({evt}, 'got playback-start');
if (this.otelSpan) {
this.logger.debug({evt}, 'got playback-start');
this._addStreamingTtsAttributes(this.otelSpan, evt);
this.otelSpan.end();
this.otelSpan = null;
if (evt.variable_tts_cache_filename) cs.trackTmpFile(evt.variable_tts_cache_filename);
}
});
this.ep.once('playback-stop', (evt) => {
this.logger.debug({evt}, 'got playback-stop');
if (evt.variable_tts_error) {
writeAlerts({
account_sid,
alert_type: AlertType.TTS_FAILURE,
vendor,
detail: evt.variable_tts_error
}).catch((err) => this.logger.info({err}, 'Error generating alert for no tts'));
}
if (evt.variable_tts_cache_filename) {
const text = parseTextFromSayString(this.text[segment]);
addFileToCache(evt.variable_tts_cache_filename, {
account_sid,
vendor,
language,
voice,
engine,
text
}).catch((err) => this.logger.info({err}, 'Error adding file to cache'));
}
});
await ep.play(filepath[segment]); await ep.play(filepath[segment]);
this.logger.debug(`Say:exec completed play file ${filepath[segment]}`); this.logger.debug(`Say:exec completed play file ${filepath[segment]}`);
} }
@@ -265,8 +323,30 @@ class TaskSay extends Task {
this.notifyStatus({event: 'kill-playback'}); this.notifyStatus({event: 'kill-playback'});
this.ep.api('uuid_break', this.ep.uuid); this.ep.api('uuid_break', this.ep.uuid);
} }
this.ep.removeAllListeners('playback-start');
this.ep.removeAllListeners('playback-stop');
} }
} }
_addStreamingTtsAttributes(span, evt) {
const attrs = {'tts.cached': false};
for (const [key, value] of Object.entries(evt)) {
if (key.startsWith('variable_tts_')) {
let newKey = key.substring('variable_tts_'.length)
.replace('elevenlabs_', 'elevenlabs.');
if (spanMapping[newKey]) newKey = spanMapping[newKey];
attrs[newKey] = value;
}
}
span.setAttributes(attrs);
}
} }
/**
 * Renames applied to streaming-TTS attribute names before they are written
 * to a span: key is the name derived from the event variable, value is the
 * attribute name actually recorded.
 */
const spanMapping = {
  'elevenlabs.reported_latency_ms': 'elevenlabs.latency_ms',
  'elevenlabs.request_id': 'elevenlabs.req_id',
  'elevenlabs.history_item_id': 'elevenlabs.item_id',
  'elevenlabs.optimize_streaming_latency': 'elevenlabs.optimization'
};
module.exports = TaskSay; module.exports = TaskSay;

View File

@@ -177,7 +177,6 @@ class Task extends Emitter {
const makeTask = require('./make_task'); const makeTask = require('./make_task');
const tasks = normalizeJambones(this.logger, json).map((tdata) => makeTask(this.logger, tdata)); const tasks = normalizeJambones(this.logger, json).map((tdata) => makeTask(this.logger, tdata));
if (tasks && tasks.length > 0) { if (tasks && tasks.length > 0) {
this.logger.info({tasks: tasks}, `${this.name} replacing application with ${tasks.length} tasks`);
this.callSession.replaceApplication(tasks); this.callSession.replaceApplication(tasks);
} }
} }

View File

@@ -176,6 +176,7 @@ function installSrfLocals(srf, logger) {
const registrar = new Registrar(logger, client); const registrar = new Registrar(logger, client);
const { const {
synthAudio, synthAudio,
addFileToCache,
getNuanceAccessToken, getNuanceAccessToken,
getIbmAccessToken, getIbmAccessToken,
} = require('@jambonz/speech-utils')({}, logger); } = require('@jambonz/speech-utils')({}, logger);
@@ -215,6 +216,7 @@ function installSrfLocals(srf, logger) {
listCalls, listCalls,
deleteCall, deleteCall,
synthAudio, synthAudio,
addFileToCache,
createHash, createHash,
retrieveHash, retrieveHash,
deleteKey, deleteKey,

View File

@@ -16,6 +16,9 @@ const uuidv4 = require('uuid-random');
const HttpRequestor = require('./http-requestor'); const HttpRequestor = require('./http-requestor');
const WsRequestor = require('./ws-requestor'); const WsRequestor = require('./ws-requestor');
const {makeOpusFirst} = require('./sdp-utils'); const {makeOpusFirst} = require('./sdp-utils');
const {
JAMBONES_USE_FREESWITCH_TIMER_FD
} = require('../config');
class SingleDialer extends Emitter { class SingleDialer extends Emitter {
constructor({logger, sbcAddress, target, opts, application, callInfo, accountInfo, rootSpan, startSpan, dialTask, constructor({logger, sbcAddress, target, opts, application, callInfo, accountInfo, rootSpan, startSpan, dialTask,
@@ -324,8 +327,12 @@ class SingleDialer extends Emitter {
} }
_configMsEndpoint() { _configMsEndpoint() {
if (this.onHoldMusic) { const opts = {
this.ep.set({hold_music: `shout://${this.onHoldMusic.replace(/^https?:\/\//, '')}`}); ...(this.onHoldMusic && {holdMusic: `shout://${this.onHoldMusic.replace(/^https?:\/\//, '')}`}),
...(JAMBONES_USE_FREESWITCH_TIMER_FD && {timer_name: 'timerfd'})
};
if (Object.keys(opts).length > 0) {
this.ep.set(opts);
} }
} }

View File

@@ -107,30 +107,30 @@ const optimalDeepramModels = {
'zh-CN':['base', 'base'], 'zh-CN':['base', 'base'],
'zh-TW': ['base', 'base'], 'zh-TW': ['base', 'base'],
da: ['enhanced', 'enhanced'], da: ['enhanced', 'enhanced'],
en: ['nova-2-conversationalai', 'nova-2'], en: ['nova-2-phonecall', 'nova-2'],
'en-US': ['nova-2-conversationalai', 'nova-2'], 'en-US': ['nova-2-phonecall', 'nova-2'],
'en-AU': ['nova-2-conversationalai', 'nova-2'], 'en-AU': ['nova-2-phonecall', 'nova-2'],
'en-GB': ['nova-2-conversationalai', 'nova-2'], 'en-GB': ['nova-2-phonecall', 'nova-2'],
'en-IN': ['nova-2-conversationalai', 'nova-2'], 'en-IN': ['nova-2-phonecall', 'nova-2'],
'en-NZ': ['nova-2-conversationalai', 'nova-2'], 'en-NZ': ['nova-2-phonecall', 'nova-2'],
nl: ['nova-2-conversationalai', 'nova-2'], nl: ['nova-2-phonecall', 'nova-2'],
fr: ['nova-2-conversationalai', 'nova-2'], fr: ['nova-2-phonecall', 'nova-2'],
'fr-CA': ['nova-2-conversationalai', 'nova-2'], 'fr-CA': ['nova-2-phonecall', 'nova-2'],
de: ['nova-2-conversationalai', 'nova-2'], de: ['nova-2-phonecall', 'nova-2'],
hi: ['nova-2-conversationalai', 'nova-2'], hi: ['nova-2-phonecall', 'nova-2'],
'hi-Latn': ['nova-2-conversationalai', 'nova-2'], 'hi-Latn': ['nova-2-phonecall', 'nova-2'],
id: ['base', 'base'], id: ['base', 'base'],
it: ['enhanced', 'enhanced'], it: ['enhanced', 'enhanced'],
ja: ['enhanced', 'enhanced'], ja: ['enhanced', 'enhanced'],
ko: ['enhanced', 'enhanced'], ko: ['enhanced', 'enhanced'],
no: ['enhanced', 'enhanced'], no: ['enhanced', 'enhanced'],
pl: ['enhanced', 'enhanced'], pl: ['enhanced', 'enhanced'],
pt: ['nova-2-conversationalai', 'nova-2'], pt: ['nova-2-phonecall', 'nova-2'],
'pt-BR': ['nova-2-conversationalai', 'nova-2'], 'pt-BR': ['nova-2-phonecall', 'nova-2'],
'pt-PT': ['base', 'base'], 'pt-PT': ['base', 'base'],
ru: ['base', 'base'], ru: ['base', 'base'],
es: ['nova-2-conversationalai', 'nova-2'], es: ['nova-2-phonecall', 'nova-2'],
'es-419': ['nova-2-conversationalai', 'nova-2'], 'es-419': ['nova-2-phonecall', 'nova-2'],
'es-LATAM': ['enhanced', 'enhanced'], 'es-LATAM': ['enhanced', 'enhanced'],
sv: ['enhanced', 'enhanced'], sv: ['enhanced', 'enhanced'],
ta: ['enhanced', 'enhanced'], ta: ['enhanced', 'enhanced'],

6872
package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@@ -3,7 +3,7 @@
"version": "0.8.5", "version": "0.8.5",
"main": "app.js", "main": "app.js",
"engines": { "engines": {
"node": ">= 10.16.0" "node": ">= 18.x"
}, },
"keywords": [ "keywords": [
"sip", "sip",
@@ -31,7 +31,7 @@
"@jambonz/http-health-check": "^0.0.1", "@jambonz/http-health-check": "^0.0.1",
"@jambonz/mw-registrar": "^0.2.4", "@jambonz/mw-registrar": "^0.2.4",
"@jambonz/realtimedb-helpers": "^0.8.7", "@jambonz/realtimedb-helpers": "^0.8.7",
"@jambonz/speech-utils": "^0.0.33", "@jambonz/speech-utils": "^0.0.38",
"@jambonz/stats-collector": "^0.1.9", "@jambonz/stats-collector": "^0.1.9",
"@jambonz/time-series": "^0.2.8", "@jambonz/time-series": "^0.2.8",
"@jambonz/verb-specifications": "^0.0.50", "@jambonz/verb-specifications": "^0.0.50",
@@ -47,7 +47,7 @@
"bent": "^7.3.12", "bent": "^7.3.12",
"debug": "^4.3.4", "debug": "^4.3.4",
"deepcopy": "^2.1.0", "deepcopy": "^2.1.0",
"drachtio-fsmrf": "^3.0.33", "drachtio-fsmrf": "^3.0.37",
"drachtio-srf": "^4.5.31", "drachtio-srf": "^4.5.31",
"express": "^4.18.2", "express": "^4.18.2",
"express-validator": "^7.0.1", "express-validator": "^7.0.1",