mirror of
https://github.com/jambonz/jambonz-feature-server.git
synced 2025-12-20 08:40:38 +00:00
Tts/elevenlabs streaming (#629)
* update to fsmrf with fix * changes to support elevenlabs tts streaming * say: add vendor data to span * bug: tts spans must include cached property * add env for JAMBONES_USE_FREESWITCH_TIMER_FD * fix bug in prev commit * wip * linting * wip - caching files generating by streaming tts * wip caching * cleanup some logs * handle tts streaming failure, write alert * update node version dependency * set timerfd on outbound call scenarios * default model to nova-2-phonecall when using deepgram --------- Co-authored-by: Dave Horton <daveh@beachdognet.com>
This commit is contained in:
@@ -132,6 +132,8 @@ const JAMBONES_DISABLE_DIRECT_P2P_CALL = process.env.JAMBONES_DISABLE_DIRECT_P2P
|
||||
|
||||
const JAMBONES_EAGERLY_PRE_CACHE_AUDIO = process.env.JAMBONES_EAGERLY_PRE_CACHE_AUDIO;
|
||||
|
||||
const JAMBONES_USE_FREESWITCH_TIMER_FD = process.env.JAMBONES_USE_FREESWITCH_TIMER_FD;
|
||||
|
||||
module.exports = {
|
||||
JAMBONES_MYSQL_HOST,
|
||||
JAMBONES_MYSQL_USER,
|
||||
@@ -213,5 +215,6 @@ module.exports = {
|
||||
JAMBONZ_RECORD_WS_USERNAME,
|
||||
JAMBONZ_RECORD_WS_PASSWORD,
|
||||
JAMBONZ_DISABLE_DIAL_PAI_HEADER,
|
||||
JAMBONES_DISABLE_DIRECT_P2P_CALL
|
||||
JAMBONES_DISABLE_DIRECT_P2P_CALL,
|
||||
JAMBONES_USE_FREESWITCH_TIMER_FD
|
||||
};
|
||||
|
||||
@@ -21,6 +21,7 @@ const {
|
||||
JAMBONES_INJECT_CONTENT,
|
||||
JAMBONES_EAGERLY_PRE_CACHE_AUDIO,
|
||||
AWS_REGION,
|
||||
JAMBONES_USE_FREESWITCH_TIMER_FD
|
||||
} = require('../config');
|
||||
const BackgroundTaskManager = require('../utils/background-task-manager');
|
||||
const BADPRECONDITIONS = 'preconditions not met';
|
||||
@@ -689,7 +690,7 @@ class CallSession extends Emitter {
|
||||
(type === 'stt' && credential.use_for_stt)
|
||||
)) {
|
||||
this.logger.info(
|
||||
`Speech vendor: ${credential.vendor} ${credential.label ? `, label: ${credential.label}` : ''} selected`);
|
||||
`${type}: ${credential.vendor} ${credential.label ? `, label: ${credential.label}` : ''} `);
|
||||
if ('google' === vendor) {
|
||||
if (type === 'tts' && !credential.tts_tested_ok ||
|
||||
type === 'stt' && !credential.stt_tested_ok) {
|
||||
@@ -2033,8 +2034,12 @@ Duration=${duration} `
|
||||
}
|
||||
|
||||
_configMsEndpoint() {
|
||||
if (this.onHoldMusic) {
|
||||
this.ep.set({hold_music: `shout://${this.onHoldMusic.replace(/^https?:\/\//, '')}`});
|
||||
const opts = {
|
||||
...(this.onHoldMusic && {holdMusic: `shout://${this.onHoldMusic.replace(/^https?:\/\//, '')}`}),
|
||||
...(JAMBONES_USE_FREESWITCH_TIMER_FD && {timer_name: 'timerfd'})
|
||||
};
|
||||
if (Object.keys(opts).length > 0) {
|
||||
this.ep.set(opts);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -123,8 +123,6 @@ class TaskListen extends Task {
|
||||
ci,
|
||||
this.metadata);
|
||||
if (this.hook.auth) {
|
||||
this.logger.debug({username: this.hook.auth.username, password: this.hook.auth.password},
|
||||
'TaskListen:_startListening basic auth');
|
||||
await this.ep.set({
|
||||
'MOD_AUDIO_BASIC_AUTH_USERNAME': this.hook.auth.username,
|
||||
'MOD_AUDIO_BASIC_AUTH_PASSWORD': this.hook.auth.password
|
||||
|
||||
106
lib/tasks/say.js
106
lib/tasks/say.js
@@ -23,6 +23,12 @@ const breakLengthyTextIfNeeded = (logger, text) => {
|
||||
}
|
||||
};
|
||||
|
||||
/**
 * Extract the spoken text from a say string by dropping everything up to and
 * including the first closing brace, e.g. `say:{k=v,...}Hello` -> `Hello`.
 *
 * @param {string} text - say string, optionally prefixed with a `{...}` section
 * @returns {string} the portion after the first `}`; the input unchanged when
 *   it contains no `}`
 */
const parseTextFromSayString = (text) => {
  const closingBraceIndex = text.indexOf('}');
  // No brace section present: the whole string is the text.
  if (closingBraceIndex === -1) return text;
  return text.slice(closingBraceIndex + 1);
};
|
||||
|
||||
class TaskSay extends Task {
|
||||
constructor(logger, opts, parentTask) {
|
||||
super(logger, opts);
|
||||
@@ -60,7 +66,7 @@ class TaskSay extends Task {
|
||||
}
|
||||
|
||||
async _synthesizeWithSpecificVendor(cs, ep, {vendor, language, voice, label, preCache = false}) {
|
||||
const {srf} = cs;
|
||||
const {srf, accountSid:account_sid} = cs;
|
||||
const {updateSpeechCredentialLastUsed} = require('../utils/db-utils')(this.logger, srf);
|
||||
const {writeAlerts, AlertType, stats} = srf.locals;
|
||||
const {synthAudio} = srf.locals.dbHelpers;
|
||||
@@ -97,11 +103,17 @@ class TaskSay extends Task {
|
||||
voice = this.options.voice_id || voice;
|
||||
}
|
||||
|
||||
this.ep.set({
|
||||
tts_engine: vendor,
|
||||
tts_voice: voice,
|
||||
cache_speech_handles: 1,
|
||||
}).catch((err) => this.logger.info({err}, 'Error setting tts_engine on endpoint'));
|
||||
|
||||
if (!preCache) this.logger.info({vendor, language, voice, model}, 'TaskSay:exec');
|
||||
try {
|
||||
if (!credentials) {
|
||||
writeAlerts({
|
||||
account_sid: cs.accountSid,
|
||||
account_sid,
|
||||
alert_type: AlertType.TTS_NOT_PROVISIONED,
|
||||
vendor
|
||||
}).catch((err) => this.logger.info({err}, 'Error generating alert for no tts'));
|
||||
@@ -120,18 +132,17 @@ class TaskSay extends Task {
|
||||
if (text.startsWith('silence_stream://')) return text;
|
||||
|
||||
/* otel: trace time for tts */
|
||||
let otelSpan;
|
||||
if (!preCache) {
|
||||
const {span} = this.startChildSpan('tts-generation', {
|
||||
'tts.vendor': vendor,
|
||||
'tts.language': language,
|
||||
'tts.voice': voice
|
||||
});
|
||||
otelSpan = span;
|
||||
this.otelSpan = span;
|
||||
}
|
||||
try {
|
||||
const {filePath, servedFromCache, rtt} = await synthAudio(stats, {
|
||||
account_sid: cs.accountSid,
|
||||
account_sid,
|
||||
text,
|
||||
vendor,
|
||||
language,
|
||||
@@ -141,17 +152,21 @@ class TaskSay extends Task {
|
||||
salt,
|
||||
credentials,
|
||||
options: this.options,
|
||||
disableTtsCache : this.disableTtsCache
|
||||
disableTtsCache : this.disableTtsCache,
|
||||
preCache
|
||||
});
|
||||
if (!filePath.startsWith('say:')) {
|
||||
this.logger.debug(`file ${filePath}, served from cache ${servedFromCache}`);
|
||||
if (filePath) cs.trackTmpFile(filePath);
|
||||
if (this.otelSpan) {
|
||||
this.otelSpan.setAttributes({'tts.cached': servedFromCache});
|
||||
this.otelSpan.end();
|
||||
this.otelSpan = null;
|
||||
}
|
||||
if (!servedFromCache && !lastUpdated) {
|
||||
lastUpdated = true;
|
||||
updateSpeechCredentialLastUsed(credentials.speech_credential_sid)
|
||||
.catch(() => {/*already logged error */});
|
||||
updateSpeechCredentialLastUsed(credentials.speech_credential_sid).catch(() => {/* logged error */});
|
||||
}
|
||||
if (otelSpan) otelSpan.setAttributes({'tts.cached': servedFromCache});
|
||||
if (otelSpan) otelSpan.end();
|
||||
if (!servedFromCache && rtt && !preCache) {
|
||||
this.notifyStatus({
|
||||
event: 'synthesized-audio',
|
||||
@@ -161,10 +176,16 @@ class TaskSay extends Task {
|
||||
elapsedTime: rtt
|
||||
});
|
||||
}
|
||||
}
|
||||
else {
|
||||
this.logger.debug('a streaming tts api will be used');
|
||||
const modifiedPath = filePath.replace('say:{', `say:{session-uuid=${this.ep.uuid},`);
|
||||
return modifiedPath;
|
||||
}
|
||||
return filePath;
|
||||
} catch (err) {
|
||||
this.logger.info({err}, 'Error synthesizing tts');
|
||||
if (otelSpan) otelSpan.end();
|
||||
if (this.otelSpan) this.otelSpan.end();
|
||||
writeAlerts({
|
||||
account_sid: cs.accountSid,
|
||||
alert_type: AlertType.TTS_FAILURE,
|
||||
@@ -186,6 +207,11 @@ class TaskSay extends Task {
|
||||
}
|
||||
|
||||
async exec(cs, {ep}) {
|
||||
const {srf, accountSid:account_sid} = cs;
|
||||
const {writeAlerts, AlertType} = srf.locals;
|
||||
const {addFileToCache} = srf.locals.dbHelpers;
|
||||
const engine = this.synthesizer.engine || 'standard';
|
||||
|
||||
await super.exec(cs);
|
||||
this.ep = ep;
|
||||
|
||||
@@ -243,7 +269,39 @@ class TaskSay extends Task {
|
||||
await this.playToConfMember(this.ep, memberId, confName, confUuid, filepath[segment]);
|
||||
}
|
||||
else {
|
||||
this.logger.debug(`Say:exec sending command to play file ${filepath[segment]}`);
|
||||
this.logger.debug(`Say:exec sending ${filepath[segment].substring(0, 64)}`);
|
||||
this.ep.once('playback-start', (evt) => {
|
||||
this.logger.debug({evt}, 'got playback-start');
|
||||
if (this.otelSpan) {
|
||||
this.logger.debug({evt}, 'got playback-start');
|
||||
this._addStreamingTtsAttributes(this.otelSpan, evt);
|
||||
this.otelSpan.end();
|
||||
this.otelSpan = null;
|
||||
if (evt.variable_tts_cache_filename) cs.trackTmpFile(evt.variable_tts_cache_filename);
|
||||
}
|
||||
});
|
||||
this.ep.once('playback-stop', (evt) => {
|
||||
this.logger.debug({evt}, 'got playback-stop');
|
||||
if (evt.variable_tts_error) {
|
||||
writeAlerts({
|
||||
account_sid,
|
||||
alert_type: AlertType.TTS_FAILURE,
|
||||
vendor,
|
||||
detail: evt.variable_tts_error
|
||||
}).catch((err) => this.logger.info({err}, 'Error generating alert for no tts'));
|
||||
}
|
||||
if (evt.variable_tts_cache_filename) {
|
||||
const text = parseTextFromSayString(this.text[segment]);
|
||||
addFileToCache(evt.variable_tts_cache_filename, {
|
||||
account_sid,
|
||||
vendor,
|
||||
language,
|
||||
voice,
|
||||
engine,
|
||||
text
|
||||
}).catch((err) => this.logger.info({err}, 'Error adding file to cache'));
|
||||
}
|
||||
});
|
||||
await ep.play(filepath[segment]);
|
||||
this.logger.debug(`Say:exec completed play file ${filepath[segment]}`);
|
||||
}
|
||||
@@ -265,8 +323,30 @@ class TaskSay extends Task {
|
||||
this.notifyStatus({event: 'kill-playback'});
|
||||
this.ep.api('uuid_break', this.ep.uuid);
|
||||
}
|
||||
}
|
||||
this.ep.removeAllListeners('playback-start');
|
||||
this.ep.removeAllListeners('playback-stop');
|
||||
}
|
||||
}
|
||||
|
||||
_addStreamingTtsAttributes(span, evt) {
|
||||
const attrs = {'tts.cached': false};
|
||||
for (const [key, value] of Object.entries(evt)) {
|
||||
if (key.startsWith('variable_tts_')) {
|
||||
let newKey = key.substring('variable_tts_'.length)
|
||||
.replace('elevenlabs_', 'elevenlabs.');
|
||||
if (spanMapping[newKey]) newKey = spanMapping[newKey];
|
||||
attrs[newKey] = value;
|
||||
}
|
||||
}
|
||||
span.setAttributes(attrs);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Renames applied to streaming-tts attribute names before they are attached
 * to a span: long elevenlabs attribute name -> shorter span attribute name.
 * Frozen because it is a shared lookup table that is only ever read.
 */
const spanMapping = Object.freeze({
  'elevenlabs.reported_latency_ms': 'elevenlabs.latency_ms',
  'elevenlabs.request_id': 'elevenlabs.req_id',
  'elevenlabs.history_item_id': 'elevenlabs.item_id',
  'elevenlabs.optimize_streaming_latency': 'elevenlabs.optimization',
});
|
||||
|
||||
module.exports = TaskSay;
|
||||
|
||||
@@ -177,7 +177,6 @@ class Task extends Emitter {
|
||||
const makeTask = require('./make_task');
|
||||
const tasks = normalizeJambones(this.logger, json).map((tdata) => makeTask(this.logger, tdata));
|
||||
if (tasks && tasks.length > 0) {
|
||||
this.logger.info({tasks: tasks}, `${this.name} replacing application with ${tasks.length} tasks`);
|
||||
this.callSession.replaceApplication(tasks);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -176,6 +176,7 @@ function installSrfLocals(srf, logger) {
|
||||
const registrar = new Registrar(logger, client);
|
||||
const {
|
||||
synthAudio,
|
||||
addFileToCache,
|
||||
getNuanceAccessToken,
|
||||
getIbmAccessToken,
|
||||
} = require('@jambonz/speech-utils')({}, logger);
|
||||
@@ -215,6 +216,7 @@ function installSrfLocals(srf, logger) {
|
||||
listCalls,
|
||||
deleteCall,
|
||||
synthAudio,
|
||||
addFileToCache,
|
||||
createHash,
|
||||
retrieveHash,
|
||||
deleteKey,
|
||||
|
||||
@@ -16,6 +16,9 @@ const uuidv4 = require('uuid-random');
|
||||
const HttpRequestor = require('./http-requestor');
|
||||
const WsRequestor = require('./ws-requestor');
|
||||
const {makeOpusFirst} = require('./sdp-utils');
|
||||
const {
|
||||
JAMBONES_USE_FREESWITCH_TIMER_FD
|
||||
} = require('../config');
|
||||
|
||||
class SingleDialer extends Emitter {
|
||||
constructor({logger, sbcAddress, target, opts, application, callInfo, accountInfo, rootSpan, startSpan, dialTask,
|
||||
@@ -324,8 +327,12 @@ class SingleDialer extends Emitter {
|
||||
}
|
||||
|
||||
_configMsEndpoint() {
|
||||
if (this.onHoldMusic) {
|
||||
this.ep.set({hold_music: `shout://${this.onHoldMusic.replace(/^https?:\/\//, '')}`});
|
||||
const opts = {
|
||||
...(this.onHoldMusic && {holdMusic: `shout://${this.onHoldMusic.replace(/^https?:\/\//, '')}`}),
|
||||
...(JAMBONES_USE_FREESWITCH_TIMER_FD && {timer_name: 'timerfd'})
|
||||
};
|
||||
if (Object.keys(opts).length > 0) {
|
||||
this.ep.set(opts);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -107,30 +107,30 @@ const optimalDeepramModels = {
|
||||
'zh-CN':['base', 'base'],
|
||||
'zh-TW': ['base', 'base'],
|
||||
da: ['enhanced', 'enhanced'],
|
||||
en: ['nova-2-conversationalai', 'nova-2'],
|
||||
'en-US': ['nova-2-conversationalai', 'nova-2'],
|
||||
'en-AU': ['nova-2-conversationalai', 'nova-2'],
|
||||
'en-GB': ['nova-2-conversationalai', 'nova-2'],
|
||||
'en-IN': ['nova-2-conversationalai', 'nova-2'],
|
||||
'en-NZ': ['nova-2-conversationalai', 'nova-2'],
|
||||
nl: ['nova-2-conversationalai', 'nova-2'],
|
||||
fr: ['nova-2-conversationalai', 'nova-2'],
|
||||
'fr-CA': ['nova-2-conversationalai', 'nova-2'],
|
||||
de: ['nova-2-conversationalai', 'nova-2'],
|
||||
hi: ['nova-2-conversationalai', 'nova-2'],
|
||||
'hi-Latn': ['nova-2-conversationalai', 'nova-2'],
|
||||
en: ['nova-2-phonecall', 'nova-2'],
|
||||
'en-US': ['nova-2-phonecall', 'nova-2'],
|
||||
'en-AU': ['nova-2-phonecall', 'nova-2'],
|
||||
'en-GB': ['nova-2-phonecall', 'nova-2'],
|
||||
'en-IN': ['nova-2-phonecall', 'nova-2'],
|
||||
'en-NZ': ['nova-2-phonecall', 'nova-2'],
|
||||
nl: ['nova-2-phonecall', 'nova-2'],
|
||||
fr: ['nova-2-phonecall', 'nova-2'],
|
||||
'fr-CA': ['nova-2-phonecall', 'nova-2'],
|
||||
de: ['nova-2-phonecall', 'nova-2'],
|
||||
hi: ['nova-2-phonecall', 'nova-2'],
|
||||
'hi-Latn': ['nova-2-phonecall', 'nova-2'],
|
||||
id: ['base', 'base'],
|
||||
it: ['enhanced', 'enhanced'],
|
||||
ja: ['enhanced', 'enhanced'],
|
||||
ko: ['enhanced', 'enhanced'],
|
||||
no: ['enhanced', 'enhanced'],
|
||||
pl: ['enhanced', 'enhanced'],
|
||||
pt: ['nova-2-conversationalai', 'nova-2'],
|
||||
'pt-BR': ['nova-2-conversationalai', 'nova-2'],
|
||||
pt: ['nova-2-phonecall', 'nova-2'],
|
||||
'pt-BR': ['nova-2-phonecall', 'nova-2'],
|
||||
'pt-PT': ['base', 'base'],
|
||||
ru: ['base', 'base'],
|
||||
es: ['nova-2-conversationalai', 'nova-2'],
|
||||
'es-419': ['nova-2-conversationalai', 'nova-2'],
|
||||
es: ['nova-2-phonecall', 'nova-2'],
|
||||
'es-419': ['nova-2-phonecall', 'nova-2'],
|
||||
'es-LATAM': ['enhanced', 'enhanced'],
|
||||
sv: ['enhanced', 'enhanced'],
|
||||
ta: ['enhanced', 'enhanced'],
|
||||
|
||||
6862
package-lock.json
generated
6862
package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@@ -3,7 +3,7 @@
|
||||
"version": "0.8.5",
|
||||
"main": "app.js",
|
||||
"engines": {
|
||||
"node": ">= 10.16.0"
|
||||
"node": ">= 18.x"
|
||||
},
|
||||
"keywords": [
|
||||
"sip",
|
||||
@@ -31,7 +31,7 @@
|
||||
"@jambonz/http-health-check": "^0.0.1",
|
||||
"@jambonz/mw-registrar": "^0.2.4",
|
||||
"@jambonz/realtimedb-helpers": "^0.8.7",
|
||||
"@jambonz/speech-utils": "^0.0.33",
|
||||
"@jambonz/speech-utils": "^0.0.38",
|
||||
"@jambonz/stats-collector": "^0.1.9",
|
||||
"@jambonz/time-series": "^0.2.8",
|
||||
"@jambonz/verb-specifications": "^0.0.50",
|
||||
@@ -47,7 +47,7 @@
|
||||
"bent": "^7.3.12",
|
||||
"debug": "^4.3.4",
|
||||
"deepcopy": "^2.1.0",
|
||||
"drachtio-fsmrf": "^3.0.33",
|
||||
"drachtio-fsmrf": "^3.0.37",
|
||||
"drachtio-srf": "^4.5.31",
|
||||
"express": "^4.18.2",
|
||||
"express-validator": "^7.0.1",
|
||||
|
||||
Reference in New Issue
Block a user