mirror of
https://github.com/jambonz/jambonz-feature-server.git
synced 2025-12-20 16:50:39 +00:00
Fixes/ws testing dh (#704)
* fixes from testing with translator app
* more updates
* linting
* update gh actions to node 20
* add support for google v2 preconfigured recognizer
* add support for google voice activity events
* update to speech-utils@0.0.45
* update speech-utils to support caching azure tts
* transcribe must buffer transcripts for channel 1 and 2 separately
* further fix for accumulating transcripts
* linting
* deepgram sends transcripts with empty alternatives array
* fix deepgram returning an empty array
This commit is contained in:
@@ -112,13 +112,17 @@ class CallSession extends Emitter {
|
||||
this.requestor.removeAllListeners();
|
||||
this.application.requestor = newRequestor;
|
||||
this.requestor.on('command', this._onCommand.bind(this));
|
||||
this.logger.debug(`CallSession: ${this.callSid} listener count ${this.requestor.listenerCount('command')}`);
|
||||
this.requestor.on('connection-dropped', this._onWsConnectionDropped.bind(this));
|
||||
this.requestor.on('handover', handover.bind(this));
|
||||
};
|
||||
|
||||
this.requestor.on('command', this._onCommand.bind(this));
|
||||
this.requestor.on('connection-dropped', this._onWsConnectionDropped.bind(this));
|
||||
this.requestor.on('handover', handover.bind(this));
|
||||
if (!this.isConfirmCallSession) {
|
||||
this.requestor.on('command', this._onCommand.bind(this));
|
||||
this.logger.debug(`CallSession: ${this.callSid} listener count ${this.requestor.listenerCount('command')}`);
|
||||
this.requestor.on('connection-dropped', this._onWsConnectionDropped.bind(this));
|
||||
this.requestor.on('handover', handover.bind(this));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -1367,6 +1371,30 @@ Duration=${duration} `
|
||||
task.whisper(tasks, callSid).catch((err) => this.logger.error(err, 'CallSession:_lccWhisper'));
|
||||
}
|
||||
|
||||
async _lccConfig(opts) {
|
||||
this.logger.debug({opts}, 'CallSession:_lccConfig');
|
||||
const t = normalizeJambones(this.logger, [
|
||||
{
|
||||
verb: 'config',
|
||||
...opts
|
||||
}
|
||||
])
|
||||
.map((tdata) => makeTask(this.logger, tdata));
|
||||
|
||||
const task = t[0];
|
||||
|
||||
const {span, ctx} = this.rootSpan.startChildSpan(`verb:${task.summary}`);
|
||||
span.setAttributes({'verb.summary': task.summary});
|
||||
task.span = span;
|
||||
task.ctx = ctx;
|
||||
try {
|
||||
await task.exec(this, {ep: this.ep});
|
||||
} catch (err) {
|
||||
this.logger.error(err, 'CallSession:_lccConfig');
|
||||
}
|
||||
task.span.end();
|
||||
}
|
||||
|
||||
async _lccDub(opts, callSid) {
|
||||
this.logger.debug({opts}, `CallSession:_lccDub on call_sid ${callSid}`);
|
||||
const t = normalizeJambones(this.logger, [
|
||||
@@ -1377,23 +1405,24 @@ Duration=${duration} `
|
||||
])
|
||||
.map((tdata) => makeTask(this.logger, tdata));
|
||||
|
||||
const dubTask = t[0];
|
||||
const task = t[0];
|
||||
const ep = this.currentTask?.name === TaskName.Dial && callSid === this.currentTask?.callSid ?
|
||||
this.currentTask.ep :
|
||||
this.ep;
|
||||
|
||||
const {span, ctx} = this.rootSpan.startChildSpan(`verb:${dubTask.summary}`);
|
||||
span.setAttributes({'verb.summary': dubTask.summary});
|
||||
dubTask.span = span;
|
||||
dubTask.ctx = ctx;
|
||||
const {span, ctx} = this.rootSpan.startChildSpan(`verb:${task.summary}`);
|
||||
span.setAttributes({'verb.summary': task.summary});
|
||||
task.span = span;
|
||||
task.ctx = ctx;
|
||||
try {
|
||||
await dubTask.exec(this, {ep});
|
||||
await task.exec(this, {ep});
|
||||
} catch (err) {
|
||||
this.logger.error(err, 'CallSession:_lccDub');
|
||||
}
|
||||
dubTask.span.end();
|
||||
task.span.end();
|
||||
}
|
||||
|
||||
|
||||
async _lccBoostAudioSignal(opts, callSid) {
|
||||
const ep = this.currentTask?.name === TaskName.Dial && callSid === this.currentTask?.callSid ?
|
||||
this.currentTask.ep :
|
||||
@@ -1664,6 +1693,10 @@ Duration=${duration} `
|
||||
this._lccCallStatus(data);
|
||||
break;
|
||||
|
||||
case 'config':
|
||||
this._lccConfig(data, call_sid);
|
||||
break;
|
||||
|
||||
case 'dial':
|
||||
this._lccCallDial(data);
|
||||
break;
|
||||
@@ -1978,6 +2011,10 @@ Duration=${duration} `
|
||||
}
|
||||
this.logger.debug(`CallSession:propagateAnswer - answered callSid ${this.callSid}`);
|
||||
}
|
||||
else {
|
||||
this.logger.debug('CallSession:propagateAnswer - call already answered - re-anchor media with a reinvite');
|
||||
await this.dlg.modify(this.ep.local.sdp);
|
||||
}
|
||||
}
|
||||
|
||||
async _onRequestWithinDialog(req, res) {
|
||||
|
||||
@@ -125,10 +125,12 @@ class TaskDub extends TtsTask {
|
||||
const path = filepath[0];
|
||||
if (!path.startsWith('say:{')) {
|
||||
/* we have a local file of mp3 or r8 of synthesized speech audio to play */
|
||||
this.logger.info(`playing synthesized speech from file on track ${this.track}: ${path}`);
|
||||
this.play = path;
|
||||
await this._playOnTrack(cs, ep);
|
||||
}
|
||||
else {
|
||||
this.logger.info(`doing actual text to speech file on track ${this.track}: ${path}`);
|
||||
await ep.dub({
|
||||
action: 'sayOnTrack',
|
||||
track: this.track,
|
||||
|
||||
@@ -338,6 +338,7 @@ class TaskEnqueue extends Task {
|
||||
this.logger.error({err}, `TaskEnqueue:_playHook error retrieving list info for queue ${this.queueName}`);
|
||||
}
|
||||
const json = await cs.application.requestor.request('verb:hook', hook, params, httpHeaders);
|
||||
this.logger.debug({json}, 'TaskEnqueue:_playHook: received response from waitHook');
|
||||
const tasks = normalizeJambones(this.logger, json).map((tdata) => makeTask(this.logger, tdata));
|
||||
|
||||
const allowedTasks = tasks.filter((t) => allowed.includes(t.name));
|
||||
|
||||
@@ -58,7 +58,7 @@ class TaskTranscribe extends SttTask {
|
||||
this.isContinuousAsr = true;
|
||||
}
|
||||
/* buffer speech for continuous asr */
|
||||
this._bufferedTranscripts = [];
|
||||
this._bufferedTranscripts = [ [], [] ]; // for channel 1 and 2
|
||||
this.bugname_prefix = 'transcribe_';
|
||||
this.paused = false;
|
||||
}
|
||||
@@ -326,6 +326,7 @@ class TaskTranscribe extends SttTask {
|
||||
// make sure this is not a transcript from answering machine detection
|
||||
const bugname = fsEvent.getHeader('media-bugname');
|
||||
const finished = fsEvent.getHeader('transcription-session-finished');
|
||||
const bufferedTranscripts = this._bufferedTranscripts[channel - 1];
|
||||
if (bugname && this.bugname !== bugname) return;
|
||||
if (this.paused) {
|
||||
this.logger.debug({evt}, 'TaskTranscribe:_onTranscription - paused, ignoring transcript');
|
||||
@@ -335,14 +336,14 @@ class TaskTranscribe extends SttTask {
|
||||
|
||||
if (this.vendor === 'deepgram' && evt.type === 'UtteranceEnd') {
|
||||
/* we will only get this when we have set utterance_end_ms */
|
||||
if (this._bufferedTranscripts.length === 0) {
|
||||
if (bufferedTranscripts.length === 0) {
|
||||
this.logger.debug('Gather:_onTranscription - got UtteranceEnd event from deepgram but no buffered transcripts');
|
||||
}
|
||||
else {
|
||||
this.logger.debug('Gather:_onTranscription - got UtteranceEnd event from deepgram, return buffered transcript');
|
||||
evt = this.consolidateTranscripts(this._bufferedTranscripts, 1, this.language, this.vendor);
|
||||
evt = this.consolidateTranscripts(bufferedTranscripts, channel, this.language, this.vendor);
|
||||
evt.is_final = true;
|
||||
this._bufferedTranscripts = [];
|
||||
this._bufferedTranscripts[channel - 1] = [];
|
||||
this._resolve(channel, evt);
|
||||
}
|
||||
return;
|
||||
@@ -359,11 +360,11 @@ class TaskTranscribe extends SttTask {
|
||||
|
||||
let emptyTranscript = false;
|
||||
if (evt.is_final) {
|
||||
if (evt.alternatives[0].transcript === '' && !cs.callGone && !this.killed) {
|
||||
if (evt.alternatives.length === 0 || evt.alternatives[0].transcript === '' && !cs.callGone && !this.killed) {
|
||||
emptyTranscript = true;
|
||||
if (finished === 'true' &&
|
||||
['microsoft', 'deepgram'].includes(this.vendor) &&
|
||||
this._bufferedTranscripts.length === 0) {
|
||||
bufferedTranscripts.length === 0) {
|
||||
this.logger.debug({evt}, 'TaskGather:_onTranscription - got empty transcript from old gather, disregarding');
|
||||
return;
|
||||
}
|
||||
@@ -376,7 +377,7 @@ class TaskTranscribe extends SttTask {
|
||||
'TaskGather:_onTranscription - got empty deepgram transcript during continous asr, continue listening');
|
||||
return;
|
||||
}
|
||||
else if (this.vendor === 'deepgram' && this._bufferedTranscripts.length > 0) {
|
||||
else if (this.vendor === 'deepgram' && bufferedTranscripts.length > 0) {
|
||||
this.logger.info({evt},
|
||||
'TaskGather:_onTranscription - got empty transcript from deepgram, return the buffered transcripts');
|
||||
}
|
||||
@@ -392,11 +393,12 @@ class TaskTranscribe extends SttTask {
|
||||
}
|
||||
}
|
||||
this.logger.info({evt}, 'TaskGather:_onTranscription - got transcript during continous asr');
|
||||
this._bufferedTranscripts.push(evt);
|
||||
bufferedTranscripts.push(evt);
|
||||
this._startAsrTimer(channel);
|
||||
|
||||
/* some STT engines will keep listening after a final response, so no need to restart */
|
||||
if (!['soniox', 'aws', 'microsoft', 'deepgram'].includes(this.vendor)) this._startTranscribing(cs, ep, channel);
|
||||
if (!['soniox', 'aws', 'microsoft', 'deepgram', 'google']
|
||||
.includes(this.vendor)) this._startTranscribing(cs, ep, channel);
|
||||
}
|
||||
else {
|
||||
if (this.vendor === 'soniox') {
|
||||
@@ -407,19 +409,20 @@ class TaskTranscribe extends SttTask {
|
||||
}
|
||||
else if (this.vendor === 'deepgram') {
|
||||
/* compile transcripts into one */
|
||||
if (!emptyTranscript) this._bufferedTranscripts.push(evt);
|
||||
if (!emptyTranscript) bufferedTranscripts.push(evt);
|
||||
|
||||
/* deepgram can send an empty and final transcript; only if we have any buffered should we resolve */
|
||||
if (this._bufferedTranscripts.length === 0) return;
|
||||
evt = this.consolidateTranscripts(this._bufferedTranscripts, channel, this.language);
|
||||
this._bufferedTranscripts = [];
|
||||
if (bufferedTranscripts.length === 0) return;
|
||||
evt = this.consolidateTranscripts(bufferedTranscripts, channel, this.language);
|
||||
this._bufferedTranscripts[channel - 1] = [];
|
||||
}
|
||||
|
||||
/* here is where we return a final transcript */
|
||||
this.logger.debug({evt}, 'TaskTranscribe:_onTranscription - sending final transcript');
|
||||
this._resolve(channel, evt);
|
||||
/* some STT engines will keep listening after a final response, so no need to restart */
|
||||
if (!['soniox', 'aws', 'microsoft', 'deepgram'].includes(this.vendor)) this._startTranscribing(cs, ep, channel);
|
||||
if (!['soniox', 'aws', 'microsoft', 'deepgram', 'google']
|
||||
.includes(this.vendor)) this._startTranscribing(cs, ep, channel);
|
||||
}
|
||||
}
|
||||
else {
|
||||
@@ -430,7 +433,7 @@ class TaskTranscribe extends SttTask {
|
||||
const originalEvent = evt.vendor.evt;
|
||||
if (originalEvent.is_final && evt.alternatives[0].transcript !== '') {
|
||||
this.logger.debug({evt}, 'Gather:_onTranscription - buffering a completed (partial) deepgram transcript');
|
||||
this._bufferedTranscripts.push(evt);
|
||||
bufferedTranscripts.push(evt);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -591,8 +594,9 @@ class TaskTranscribe extends SttTask {
|
||||
this._clearAsrTimer(channel);
|
||||
this._asrTimer = setTimeout(() => {
|
||||
this.logger.debug(`TaskTranscribe:_startAsrTimer - asr timer went off for channel: ${channel}`);
|
||||
const evt = this.consolidateTranscripts(this._bufferedTranscripts, channel, this.language, this.vendor);
|
||||
this._bufferedTranscripts = [];
|
||||
const evt = this.consolidateTranscripts(
|
||||
this._bufferedTranscripts[channel - 1], channel, this.language, this.vendor);
|
||||
this._bufferedTranscripts[channel - 1] = [];
|
||||
this._resolve(channel, evt);
|
||||
}, this.asrTimeout);
|
||||
this.logger.debug(`TaskTranscribe:_startAsrTimer: set for ${this.asrTimeout}ms for channel ${channel}`);
|
||||
|
||||
@@ -171,6 +171,7 @@
|
||||
"session:new",
|
||||
"session:reconnect",
|
||||
"session:redirect",
|
||||
"session:adulting",
|
||||
"call:status",
|
||||
"queue:status",
|
||||
"dial:confirm",
|
||||
|
||||
@@ -413,6 +413,7 @@ class SingleDialer extends Emitter {
|
||||
const app = {...application};
|
||||
if ('WS' === app.call_hook?.method ||
|
||||
app.call_hook?.url.startsWith('ws://') || app.call_hook?.url.startsWith('wss://')) {
|
||||
if (app.call_hook?.url) app.call_hook.url += '/adulting';
|
||||
const requestor = new WsRequestor(logger, this.accountInfo.account.account_sid,
|
||||
app.call_hook, this.accountInfo.account.webhook_secret);
|
||||
app.requestor = requestor;
|
||||
@@ -438,6 +439,13 @@ class SingleDialer extends Emitter {
|
||||
tasks,
|
||||
rootSpan
|
||||
});
|
||||
app.requestor.request('session:adulting', '/adulting', {
|
||||
...cs.callInfo.toJSON(),
|
||||
parentCallInfo: this.parentCallInfo
|
||||
}).catch((err) => {
|
||||
newLogger.error({err}, 'doAdulting: error sending adulting request');
|
||||
});
|
||||
|
||||
cs.req = this.req;
|
||||
cs.exec().catch((err) => newLogger.error({err}, 'doAdulting: error executing session'));
|
||||
return cs;
|
||||
|
||||
@@ -270,7 +270,7 @@ const normalizeDeepgram = (evt, channel, language, shortUtterance) => {
|
||||
language_code: language,
|
||||
channel_tag: channel,
|
||||
is_final: shortUtterance ? evt.is_final : evt.speech_final,
|
||||
alternatives: [alternatives[0]],
|
||||
alternatives: alternatives.length ? [alternatives[0]] : [],
|
||||
vendor: {
|
||||
name: 'deepgram',
|
||||
evt: copy
|
||||
@@ -537,7 +537,11 @@ module.exports = (logger) => {
|
||||
}),
|
||||
...(rOpts.googleOptions?.enableVoiceActivityEvents && {
|
||||
GOOGLE_SPEECH_ENABLE_VOICE_ACTIVITY_EVENTS: rOpts.googleOptions.enableVoiceActivityEvents
|
||||
})
|
||||
}),
|
||||
...(rOpts.sgoogleOptions?.recognizerId) && {GOOGLE_SPEECH_RECOGNIZER_ID: rOpts.googleOptions.recognizerId},
|
||||
...(rOpts.googleOptions?.enableVoiceActivityEvents && {
|
||||
GOOGLE_SPEECH_ENABLE_VOICE_ACTIVITY_EVENTS: rOpts.googleOptions.enableVoiceActivityEvents
|
||||
}),
|
||||
}),
|
||||
};
|
||||
}
|
||||
|
||||
@@ -119,7 +119,7 @@ class WsRequestor extends BaseRequestor {
|
||||
type,
|
||||
msgid,
|
||||
call_sid: this.call_sid,
|
||||
hook: type === 'verb:hook' ? url : undefined,
|
||||
hook: ['verb:hook', 'session:redirect'].includes(type) ? url : undefined,
|
||||
data: {...payload},
|
||||
...b3
|
||||
};
|
||||
@@ -346,7 +346,9 @@ class WsRequestor extends BaseRequestor {
|
||||
/* messages must be JSON format */
|
||||
try {
|
||||
const obj = JSON.parse(content);
|
||||
const {type, msgid, command, call_sid = this.call_sid, queueCommand = false, data} = obj;
|
||||
//const {type, msgid, command, call_sid = this.call_sid, queueCommand = false, data} = obj;
|
||||
const {type, msgid, command, queueCommand = false, data} = obj;
|
||||
const call_sid = obj.callSid || this.call_sid;
|
||||
|
||||
//this.logger.debug({obj}, 'WsRequestor:request websocket: received');
|
||||
assert.ok(type, 'type property not supplied');
|
||||
|
||||
Reference in New Issue
Block a user