Tts/elevenlabs streaming (#629)

* update to fsmrf with fix

* changes to support elevenlabs tts streaming

* say: add vendor data to span

* bug: tts spans must include cached property

* add env for JAMBONES_USE_FREESWITCH_TIMER_FD

* fix bug in prev commit

* wip

* linting

* wip - caching files generated by streaming tts

* wip caching

* cleanup some logs

* handle tts streaming failure, write alert

* update node version dependency

* set timerfd on outbound call scenarios

* default model to nova-2-phonecall when using deepgram

---------

Co-authored-by: Dave Horton <daveh@beachdognet.com>
This commit is contained in:
Hoan Luu Huu
2024-02-07 20:49:36 +07:00
committed by GitHub
parent 48a81072e8
commit a55f81676b
10 changed files with 2932 additions and 4138 deletions

View File

@@ -132,6 +132,8 @@ const JAMBONES_DISABLE_DIRECT_P2P_CALL = process.env.JAMBONES_DISABLE_DIRECT_P2P
const JAMBONES_EAGERLY_PRE_CACHE_AUDIO = process.env.JAMBONES_EAGERLY_PRE_CACHE_AUDIO;
const JAMBONES_USE_FREESWITCH_TIMER_FD = process.env.JAMBONES_USE_FREESWITCH_TIMER_FD;
module.exports = {
JAMBONES_MYSQL_HOST,
JAMBONES_MYSQL_USER,
@@ -213,5 +215,6 @@ module.exports = {
JAMBONZ_RECORD_WS_USERNAME,
JAMBONZ_RECORD_WS_PASSWORD,
JAMBONZ_DISABLE_DIAL_PAI_HEADER,
JAMBONES_DISABLE_DIRECT_P2P_CALL
JAMBONES_DISABLE_DIRECT_P2P_CALL,
JAMBONES_USE_FREESWITCH_TIMER_FD
};

View File

@@ -21,6 +21,7 @@ const {
JAMBONES_INJECT_CONTENT,
JAMBONES_EAGERLY_PRE_CACHE_AUDIO,
AWS_REGION,
JAMBONES_USE_FREESWITCH_TIMER_FD
} = require('../config');
const BackgroundTaskManager = require('../utils/background-task-manager');
const BADPRECONDITIONS = 'preconditions not met';
@@ -689,7 +690,7 @@ class CallSession extends Emitter {
(type === 'stt' && credential.use_for_stt)
)) {
this.logger.info(
`Speech vendor: ${credential.vendor} ${credential.label ? `, label: ${credential.label}` : ''} selected`);
`${type}: ${credential.vendor} ${credential.label ? `, label: ${credential.label}` : ''} `);
if ('google' === vendor) {
if (type === 'tts' && !credential.tts_tested_ok ||
type === 'stt' && !credential.stt_tested_ok) {
@@ -2033,8 +2034,12 @@ Duration=${duration} `
}
_configMsEndpoint() {
if (this.onHoldMusic) {
this.ep.set({hold_music: `shout://${this.onHoldMusic.replace(/^https?:\/\//, '')}`});
const opts = {
...(this.onHoldMusic && {holdMusic: `shout://${this.onHoldMusic.replace(/^https?:\/\//, '')}`}),
...(JAMBONES_USE_FREESWITCH_TIMER_FD && {timer_name: 'timerfd'})
};
if (Object.keys(opts).length > 0) {
this.ep.set(opts);
}
}

View File

@@ -123,8 +123,6 @@ class TaskListen extends Task {
ci,
this.metadata);
if (this.hook.auth) {
this.logger.debug({username: this.hook.auth.username, password: this.hook.auth.password},
'TaskListen:_startListening basic auth');
await this.ep.set({
'MOD_AUDIO_BASIC_AUTH_USERNAME': this.hook.auth.username,
'MOD_AUDIO_BASIC_AUTH_PASSWORD': this.hook.auth.password

View File

@@ -23,6 +23,12 @@ const breakLengthyTextIfNeeded = (logger, text) => {
}
};
/**
 * Return the portion of `text` that follows its first closing brace.
 *
 * When `text` contains no '}' it is returned unchanged; otherwise everything
 * up to and including the first '}' is dropped.
 *
 * @param {string} text - string to strip the leading `...}` section from
 * @returns {string} the remainder after the first '}', or `text` itself
 */
const parseTextFromSayString = (text) => {
  const bracePos = text.indexOf('}');
  return bracePos === -1 ? text : text.slice(bracePos + 1);
};
class TaskSay extends Task {
constructor(logger, opts, parentTask) {
super(logger, opts);
@@ -60,7 +66,7 @@ class TaskSay extends Task {
}
async _synthesizeWithSpecificVendor(cs, ep, {vendor, language, voice, label, preCache = false}) {
const {srf} = cs;
const {srf, accountSid:account_sid} = cs;
const {updateSpeechCredentialLastUsed} = require('../utils/db-utils')(this.logger, srf);
const {writeAlerts, AlertType, stats} = srf.locals;
const {synthAudio} = srf.locals.dbHelpers;
@@ -97,11 +103,17 @@ class TaskSay extends Task {
voice = this.options.voice_id || voice;
}
this.ep.set({
tts_engine: vendor,
tts_voice: voice,
cache_speech_handles: 1,
}).catch((err) => this.logger.info({err}, 'Error setting tts_engine on endpoint'));
if (!preCache) this.logger.info({vendor, language, voice, model}, 'TaskSay:exec');
try {
if (!credentials) {
writeAlerts({
account_sid: cs.accountSid,
account_sid,
alert_type: AlertType.TTS_NOT_PROVISIONED,
vendor
}).catch((err) => this.logger.info({err}, 'Error generating alert for no tts'));
@@ -120,18 +132,17 @@ class TaskSay extends Task {
if (text.startsWith('silence_stream://')) return text;
/* otel: trace time for tts */
let otelSpan;
if (!preCache) {
const {span} = this.startChildSpan('tts-generation', {
'tts.vendor': vendor,
'tts.language': language,
'tts.voice': voice
});
otelSpan = span;
this.otelSpan = span;
}
try {
const {filePath, servedFromCache, rtt} = await synthAudio(stats, {
account_sid: cs.accountSid,
account_sid,
text,
vendor,
language,
@@ -141,30 +152,40 @@ class TaskSay extends Task {
salt,
credentials,
options: this.options,
disableTtsCache : this.disableTtsCache
disableTtsCache : this.disableTtsCache,
preCache
});
this.logger.debug(`file ${filePath}, served from cache ${servedFromCache}`);
if (filePath) cs.trackTmpFile(filePath);
if (!servedFromCache && !lastUpdated) {
lastUpdated = true;
updateSpeechCredentialLastUsed(credentials.speech_credential_sid)
.catch(() => {/*already logged error */});
if (!filePath.startsWith('say:')) {
this.logger.debug(`file ${filePath}, served from cache ${servedFromCache}`);
if (filePath) cs.trackTmpFile(filePath);
if (this.otelSpan) {
this.otelSpan.setAttributes({'tts.cached': servedFromCache});
this.otelSpan.end();
this.otelSpan = null;
}
if (!servedFromCache && !lastUpdated) {
lastUpdated = true;
updateSpeechCredentialLastUsed(credentials.speech_credential_sid).catch(() => {/* logged error */});
}
if (!servedFromCache && rtt && !preCache) {
this.notifyStatus({
event: 'synthesized-audio',
vendor,
language,
characters: text.length,
elapsedTime: rtt
});
}
}
if (otelSpan) otelSpan.setAttributes({'tts.cached': servedFromCache});
if (otelSpan) otelSpan.end();
if (!servedFromCache && rtt && !preCache) {
this.notifyStatus({
event: 'synthesized-audio',
vendor,
language,
characters: text.length,
elapsedTime: rtt
});
else {
this.logger.debug('a streaming tts api will be used');
const modifiedPath = filePath.replace('say:{', `say:{session-uuid=${this.ep.uuid},`);
return modifiedPath;
}
return filePath;
} catch (err) {
this.logger.info({err}, 'Error synthesizing tts');
if (otelSpan) otelSpan.end();
if (this.otelSpan) this.otelSpan.end();
writeAlerts({
account_sid: cs.accountSid,
alert_type: AlertType.TTS_FAILURE,
@@ -186,6 +207,11 @@ class TaskSay extends Task {
}
async exec(cs, {ep}) {
const {srf, accountSid:account_sid} = cs;
const {writeAlerts, AlertType} = srf.locals;
const {addFileToCache} = srf.locals.dbHelpers;
const engine = this.synthesizer.engine || 'standard';
await super.exec(cs);
this.ep = ep;
@@ -243,7 +269,39 @@ class TaskSay extends Task {
await this.playToConfMember(this.ep, memberId, confName, confUuid, filepath[segment]);
}
else {
this.logger.debug(`Say:exec sending command to play file ${filepath[segment]}`);
this.logger.debug(`Say:exec sending ${filepath[segment].substring(0, 64)}`);
this.ep.once('playback-start', (evt) => {
this.logger.debug({evt}, 'got playback-start');
if (this.otelSpan) {
this.logger.debug({evt}, 'got playback-start');
this._addStreamingTtsAttributes(this.otelSpan, evt);
this.otelSpan.end();
this.otelSpan = null;
if (evt.variable_tts_cache_filename) cs.trackTmpFile(evt.variable_tts_cache_filename);
}
});
this.ep.once('playback-stop', (evt) => {
this.logger.debug({evt}, 'got playback-stop');
if (evt.variable_tts_error) {
writeAlerts({
account_sid,
alert_type: AlertType.TTS_FAILURE,
vendor,
detail: evt.variable_tts_error
}).catch((err) => this.logger.info({err}, 'Error generating alert for no tts'));
}
if (evt.variable_tts_cache_filename) {
const text = parseTextFromSayString(this.text[segment]);
addFileToCache(evt.variable_tts_cache_filename, {
account_sid,
vendor,
language,
voice,
engine,
text
}).catch((err) => this.logger.info({err}, 'Error adding file to cache'));
}
});
await ep.play(filepath[segment]);
this.logger.debug(`Say:exec completed play file ${filepath[segment]}`);
}
@@ -265,8 +323,30 @@ class TaskSay extends Task {
this.notifyStatus({event: 'kill-playback'});
this.ep.api('uuid_break', this.ep.uuid);
}
this.ep.removeAllListeners('playback-start');
this.ep.removeAllListeners('playback-stop');
}
}
_addStreamingTtsAttributes(span, evt) {
const attrs = {'tts.cached': false};
for (const [key, value] of Object.entries(evt)) {
if (key.startsWith('variable_tts_')) {
let newKey = key.substring('variable_tts_'.length)
.replace('elevenlabs_', 'elevenlabs.');
if (spanMapping[newKey]) newKey = spanMapping[newKey];
attrs[newKey] = value;
}
}
span.setAttributes(attrs);
}
}
/**
 * Lookup table of attribute-name aliases used by _addStreamingTtsAttributes.
 * Keys are attribute names as derived from `variable_tts_*` event properties
 * (prefix removed, `elevenlabs_` rewritten to `elevenlabs.`); values are the
 * names actually written to the span. Names not listed here are used as-is.
 */
const spanMapping = {
'elevenlabs.reported_latency_ms': 'elevenlabs.latency_ms',
'elevenlabs.request_id': 'elevenlabs.req_id',
'elevenlabs.history_item_id': 'elevenlabs.item_id',
'elevenlabs.optimize_streaming_latency': 'elevenlabs.optimization',
};
module.exports = TaskSay;

View File

@@ -177,7 +177,6 @@ class Task extends Emitter {
const makeTask = require('./make_task');
const tasks = normalizeJambones(this.logger, json).map((tdata) => makeTask(this.logger, tdata));
if (tasks && tasks.length > 0) {
this.logger.info({tasks: tasks}, `${this.name} replacing application with ${tasks.length} tasks`);
this.callSession.replaceApplication(tasks);
}
}

View File

@@ -176,6 +176,7 @@ function installSrfLocals(srf, logger) {
const registrar = new Registrar(logger, client);
const {
synthAudio,
addFileToCache,
getNuanceAccessToken,
getIbmAccessToken,
} = require('@jambonz/speech-utils')({}, logger);
@@ -215,6 +216,7 @@ function installSrfLocals(srf, logger) {
listCalls,
deleteCall,
synthAudio,
addFileToCache,
createHash,
retrieveHash,
deleteKey,

View File

@@ -16,6 +16,9 @@ const uuidv4 = require('uuid-random');
const HttpRequestor = require('./http-requestor');
const WsRequestor = require('./ws-requestor');
const {makeOpusFirst} = require('./sdp-utils');
const {
JAMBONES_USE_FREESWITCH_TIMER_FD
} = require('../config');
class SingleDialer extends Emitter {
constructor({logger, sbcAddress, target, opts, application, callInfo, accountInfo, rootSpan, startSpan, dialTask,
@@ -324,8 +327,12 @@ class SingleDialer extends Emitter {
}
_configMsEndpoint() {
if (this.onHoldMusic) {
this.ep.set({hold_music: `shout://${this.onHoldMusic.replace(/^https?:\/\//, '')}`});
const opts = {
...(this.onHoldMusic && {holdMusic: `shout://${this.onHoldMusic.replace(/^https?:\/\//, '')}`}),
...(JAMBONES_USE_FREESWITCH_TIMER_FD && {timer_name: 'timerfd'})
};
if (Object.keys(opts).length > 0) {
this.ep.set(opts);
}
}

View File

@@ -107,30 +107,30 @@ const optimalDeepramModels = {
'zh-CN':['base', 'base'],
'zh-TW': ['base', 'base'],
da: ['enhanced', 'enhanced'],
en: ['nova-2-conversationalai', 'nova-2'],
'en-US': ['nova-2-conversationalai', 'nova-2'],
'en-AU': ['nova-2-conversationalai', 'nova-2'],
'en-GB': ['nova-2-conversationalai', 'nova-2'],
'en-IN': ['nova-2-conversationalai', 'nova-2'],
'en-NZ': ['nova-2-conversationalai', 'nova-2'],
nl: ['nova-2-conversationalai', 'nova-2'],
fr: ['nova-2-conversationalai', 'nova-2'],
'fr-CA': ['nova-2-conversationalai', 'nova-2'],
de: ['nova-2-conversationalai', 'nova-2'],
hi: ['nova-2-conversationalai', 'nova-2'],
'hi-Latn': ['nova-2-conversationalai', 'nova-2'],
en: ['nova-2-phonecall', 'nova-2'],
'en-US': ['nova-2-phonecall', 'nova-2'],
'en-AU': ['nova-2-phonecall', 'nova-2'],
'en-GB': ['nova-2-phonecall', 'nova-2'],
'en-IN': ['nova-2-phonecall', 'nova-2'],
'en-NZ': ['nova-2-phonecall', 'nova-2'],
nl: ['nova-2-phonecall', 'nova-2'],
fr: ['nova-2-phonecall', 'nova-2'],
'fr-CA': ['nova-2-phonecall', 'nova-2'],
de: ['nova-2-phonecall', 'nova-2'],
hi: ['nova-2-phonecall', 'nova-2'],
'hi-Latn': ['nova-2-phonecall', 'nova-2'],
id: ['base', 'base'],
it: ['enhanced', 'enhanced'],
ja: ['enhanced', 'enhanced'],
ko: ['enhanced', 'enhanced'],
no: ['enhanced', 'enhanced'],
pl: ['enhanced', 'enhanced'],
pt: ['nova-2-conversationalai', 'nova-2'],
'pt-BR': ['nova-2-conversationalai', 'nova-2'],
pt: ['nova-2-phonecall', 'nova-2'],
'pt-BR': ['nova-2-phonecall', 'nova-2'],
'pt-PT': ['base', 'base'],
ru: ['base', 'base'],
es: ['nova-2-conversationalai', 'nova-2'],
'es-419': ['nova-2-conversationalai', 'nova-2'],
es: ['nova-2-phonecall', 'nova-2'],
'es-419': ['nova-2-phonecall', 'nova-2'],
'es-LATAM': ['enhanced', 'enhanced'],
sv: ['enhanced', 'enhanced'],
ta: ['enhanced', 'enhanced'],

6872
package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@@ -3,7 +3,7 @@
"version": "0.8.5",
"main": "app.js",
"engines": {
"node": ">= 10.16.0"
"node": ">= 18.x"
},
"keywords": [
"sip",
@@ -31,7 +31,7 @@
"@jambonz/http-health-check": "^0.0.1",
"@jambonz/mw-registrar": "^0.2.4",
"@jambonz/realtimedb-helpers": "^0.8.7",
"@jambonz/speech-utils": "^0.0.33",
"@jambonz/speech-utils": "^0.0.38",
"@jambonz/stats-collector": "^0.1.9",
"@jambonz/time-series": "^0.2.8",
"@jambonz/verb-specifications": "^0.0.50",
@@ -47,7 +47,7 @@
"bent": "^7.3.12",
"debug": "^4.3.4",
"deepcopy": "^2.1.0",
"drachtio-fsmrf": "^3.0.33",
"drachtio-fsmrf": "^3.0.37",
"drachtio-srf": "^4.5.31",
"express": "^4.18.2",
"express-validator": "^7.0.1",