Mirror of https://github.com/jambonz/jambonz-feature-server.git, synced 2026-02-12 09:19:34 +00:00
Feat/ambient sounds (#678)
* initial support for coaching mode in conference
* wip
* wip
* add support for answer verb
* wip
* wip
* wip
* wip
* wip
* updates to rename option to dub
* wip
* wip
* wip
* update verb-specs
* wip
* wip
* wip
* wip
* wip
* wip
* wip
* wip
* add option to boost audio signal in main channel
* wip
* wip
* wip
* wip
* wip
* wip
* for now, bypass use of streaming apis when generating tts audio for dub tracks
* add nested dub to dial
* wip
* add support for filler noise
* kill filler noise when gather killed
* wip
* wip
* while using sayOnTrack, we have to enclose the say command in double quotes
* disableTtsStreaming = false
* allow transcribe of b leg only on dial verb
* dub.say can either be text or object like say verb with text and synthesizer
* remove loop for sayOnTrack
* update speech-utils
* fixes for testing transcribe verb and support for dub and boostAudioSignal in lcc commands
* add dial.boostAudioSignal
* fix bug where session-level recognizer settings incorrectly overwrite verb-level settings
* update verb specs
* update dial to support array of dub verbs
* fix bug setting gain
* lint
* wip
* update speech-utils
* use new endpoint methods for mod_dub

---------

Co-authored-by: Dave Horton <daveh@beachdognet.com>
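The squashed commits above center on three features: a `dub` capability for adding and controlling extra audio tracks on a call (including sayOnTrack/playOnTrack and filler noise), a `boostAudioSignal` option to raise or lower gain on the main channel, and per-leg transcription in the dial verb. As a rough illustration only — property names follow the commit notes above, but the authoritative shapes are in the updated verb-specs — an application payload exercising the dub and gain features might look like:

// Hypothetical jambonz application snippet: a sketch, not repo code.
const app = [
  {
    verb: 'dial',
    target: [{type: 'phone', number: '15083084809'}],  // hypothetical target
    boostAudioSignal: 6,  // per the 'add dial.boostAudioSignal' note; exact format per verb-specs
    dub: [                // dial now accepts an array of dub verbs
      {
        verb: 'dub',
        action: 'addTrack',
        track: 'ambient',
        play: 'https://example.com/sounds/office-background.mp3'  // hypothetical URL
      }
    ]
  }
];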
@@ -32,8 +32,22 @@ class TaskTranscribe extends SttTask {
     }
 
     /* for nested transcribe in dial, unless the app explicitly says so we want to transcribe both legs */
-    if (this.parentTask?.name === TaskName.Dial && this.separateRecognitionPerChannel !== false) {
-      this.separateRecognitionPerChannel = true;
+    if (this.parentTask?.name === TaskName.Dial) {
+      if (this.data.channel === 1 || this.data.channel === 2) {
+        /* transcribe only the channel specified */
+        this.separateRecognitionPerChannel = false;
+        this.channel = this.data.channel;
+        logger.debug(`TaskTranscribe: transcribing only channel ${this.channel} in the Dial verb`);
+      }
+      else if (this.separateRecognitionPerChannel !== false) {
+        this.separateRecognitionPerChannel = true;
+      }
+      else {
+        this.channel = 1;
+      }
+    }
+    else {
+      this.channel = 1;
     }
 
     this.childSpan = [null, null];
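This constructor change implements the "allow transcribe of b leg only on dial verb" item: a transcribe nested in dial may now carry a channel property (1 = A leg, 2 = B leg), while omitting it keeps the old default of transcribing both legs separately. A minimal sketch of such a nested verb (hook path and recognizer settings are placeholders):

// Sketch: transcribe only the dialed (B) leg of a call.
const dial = {
  verb: 'dial',
  target: [{type: 'phone', number: '15083084809'}],  // hypothetical target
  transcribe: {
    channel: 2,                                      // read as this.data.channel above
    transcriptionHook: '/transcripts',               // hypothetical webhook path
    recognizer: {vendor: 'google', language: 'en-US'}
  }
};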
@@ -51,6 +65,14 @@ class TaskTranscribe extends SttTask {
 
   get name() { return TaskName.Transcribe; }
 
+  get transcribing1() {
+    return this.channel === 1 || this.separateRecognitionPerChannel;
+  }
+
+  get transcribing2() {
+    return this.channel === 2 || this.separateRecognitionPerChannel && this.ep2;
+  }
+
   async exec(cs, {ep, ep2}) {
     await super.exec(cs, {ep, ep2});
 
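These getters centralize the per-leg gating used through the rest of the file (exec, _stopTranscription, pause/resume). A standalone sketch, not repo code, showing how they resolve for the three configurations the constructor can produce:

// Truth table for the transcribing1/transcribing2 logic above.
const cases = [
  {label: 'A leg only', channel: 1, separateRecognitionPerChannel: false, ep2: null},
  {label: 'B leg only', channel: 2, separateRecognitionPerChannel: false, ep2: {}},
  {label: 'both legs', channel: undefined, separateRecognitionPerChannel: true, ep2: {}}
];
for (const c of cases) {
  const transcribing1 = c.channel === 1 || c.separateRecognitionPerChannel;
  const transcribing2 = c.channel === 2 || (c.separateRecognitionPerChannel && Boolean(c.ep2));
  console.log(c.label, {transcribing1, transcribing2});
}
// -> A leg only { transcribing1: true,  transcribing2: false }
// -> B leg only { transcribing1: false, transcribing2: true }
// -> both legs  { transcribing1: true,  transcribing2: true }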
@@ -73,8 +95,10 @@ class TaskTranscribe extends SttTask {
     }
 
     try {
-      await this._startTranscribing(cs, ep, 1);
-      if (this.separateRecognitionPerChannel && ep2) {
+      if (this.transcribing1) {
+        await this._startTranscribing(cs, ep, 1);
+      }
+      if (this.transcribing2) {
         await this._startTranscribing(cs, ep2, 2);
       }
 
@@ -91,7 +115,7 @@ class TaskTranscribe extends SttTask {
 
   async _stopTranscription() {
     let stopTranscription = false;
-    if (this.ep?.connected) {
+    if (this.transcribing1 && this.ep?.connected) {
       stopTranscription = true;
       this.ep.stopTranscription({
         vendor: this.vendor,
@@ -99,7 +123,7 @@ class TaskTranscribe extends SttTask {
       })
         .catch((err) => this.logger.info(err, 'Error TaskTranscribe:kill'));
     }
-    if (this.separateRecognitionPerChannel && this.ep2 && this.ep2.connected) {
+    if (this.transcribing2 && this.ep2.connected) {
       stopTranscription = true;
       this.ep2.stopTranscription({vendor: this.vendor, bugname: this.bugname})
         .catch((err) => this.logger.info(err, 'Error TaskTranscribe:kill'));
@@ -128,10 +152,8 @@ class TaskTranscribe extends SttTask {
       break;
     case TranscribeStatus.Resume:
       this.paused = false;
-      await this._startTranscribing(this.cs, this.ep, 1);
-      if (this.separateRecognitionPerChannel && this.ep2) {
-        await this._startTranscribing(this.cs, this.ep2, 2);
-      }
+      if (this.transcribing1) await this._startTranscribing(this.cs, this.ep, 1);
+      if (this.transcribing2) await this._startTranscribing(this.cs, this.ep2, 2);
       break;
     }
   }
@@ -294,7 +316,7 @@ class TaskTranscribe extends SttTask {
       vendor: this.vendor,
       interim: this.interim ? true : false,
       locale: this.language,
-      channels: /*this.separateRecognitionPerChannel ? 2 : */ 1,
+      channels: 1,
       bugname: this.bugname,
       hostport: this.hostport
     });
@@ -303,12 +325,12 @@ class TaskTranscribe extends SttTask {
   async _onTranscription(cs, ep, channel, evt, fsEvent) {
     // make sure this is not a transcript from answering machine detection
     const bugname = fsEvent.getHeader('media-bugname');
+    const finished = fsEvent.getHeader('transcription-session-finished');
     if (bugname && this.bugname !== bugname) return;
     if (this.paused) {
       this.logger.debug({evt}, 'TaskTranscribe:_onTranscription - paused, ignoring transcript');
     }
-
 
     if (this.vendor === 'ibm' && evt?.state === 'listening') return;
 
     if (this.vendor === 'deepgram' && evt.type === 'UtteranceEnd') {
@@ -319,8 +341,9 @@ class TaskTranscribe extends SttTask {
     else {
       this.logger.debug('Gather:_onTranscription - got UtteranceEnd event from deepgram, return buffered transcript');
       evt = this.consolidateTranscripts(this._bufferedTranscripts, 1, this.language, this.vendor);
+      evt.is_final = true;
       this._bufferedTranscripts = [];
-      this._resolve('speech', evt);
+      this._resolve(channel, evt);
     }
     return;
   }
@@ -334,31 +357,87 @@ class TaskTranscribe extends SttTask {
       return;
     }
 
-    if (evt.alternatives[0]?.transcript === '' && !cs.callGone && !this.killed) {
-      if (['microsoft', 'deepgram'].includes(this.vendor)) {
-        this.logger.info({evt}, 'TaskTranscribe:_onTranscription - got empty transcript, continue listening');
-      }
-      else {
-        this.logger.info({evt}, 'TaskTranscribe:_onTranscription - got empty transcript, listen again');
-        this._transcribe(ep);
-      }
-      return;
-    }
-    if (this.vendor === 'soniox') {
-      /* compile transcripts into one */
-      this._sonioxTranscripts.push(evt.vendor.finalWords);
-      if (evt.is_final) {
-        evt = this.compileSonioxTranscripts(this._sonioxTranscripts, 1, this.language);
-        this._sonioxTranscripts = [];
-      }
-    }
-
-    if (this.isContinuousAsr && evt.is_final) {
-      this._bufferedTranscripts.push(evt);
-      this._startAsrTimer(channel);
-    } else {
-      await this._resolve(channel, evt);
-    }
+    let emptyTranscript = false;
+    if (evt.is_final) {
+      if (evt.alternatives[0].transcript === '' && !cs.callGone && !this.killed) {
+        emptyTranscript = true;
+        if (finished === 'true' &&
+          ['microsoft', 'deepgram'].includes(this.vendor) &&
+          this._bufferedTranscripts.length === 0) {
+          this.logger.debug({evt}, 'TaskGather:_onTranscription - got empty transcript from old gather, disregarding');
+          return;
+        }
+        else if (this.vendor !== 'deepgram') {
+          this.logger.info({evt}, 'TaskGather:_onTranscription - got empty transcript, continue listening');
+          return;
+        }
+        else if (this.isContinuousAsr) {
+          this.logger.info({evt},
+            'TaskGather:_onTranscription - got empty deepgram transcript during continous asr, continue listening');
+          return;
+        }
+        else if (this.vendor === 'deepgram' && this._bufferedTranscripts.length > 0) {
+          this.logger.info({evt},
+            'TaskGather:_onTranscription - got empty transcript from deepgram, return the buffered transcripts');
+        }
+      }
+      if (this.isContinuousAsr) {
+        /* append the transcript and start listening again for asrTimeout */
+        const t = evt.alternatives[0].transcript;
+        if (t) {
+          /* remove trailing punctuation */
+          if (/[,;:\.!\?]$/.test(t)) {
+            this.logger.debug('TaskGather:_onTranscription - removing trailing punctuation');
+            evt.alternatives[0].transcript = t.slice(0, -1);
+          }
+        }
+        this.logger.info({evt}, 'TaskGather:_onTranscription - got transcript during continous asr');
+        this._bufferedTranscripts.push(evt);
+        this._startAsrTimer(channel);
+
+        /* some STT engines will keep listening after a final response, so no need to restart */
+        if (!['soniox', 'aws', 'microsoft', 'deepgram'].includes(this.vendor)) this._startTranscribing(cs, ep, channel);
+      }
+      else {
+        if (this.vendor === 'soniox') {
+          /* compile transcripts into one */
+          this._sonioxTranscripts.push(evt.vendor.finalWords);
+          evt = this.compileSonioxTranscripts(this._sonioxTranscripts, 1, this.language);
+          this._sonioxTranscripts = [];
+        }
+        else if (this.vendor === 'deepgram') {
+          /* compile transcripts into one */
+          if (!emptyTranscript) this._bufferedTranscripts.push(evt);
+          /* deepgram can send an empty and final transcript; only if we have any buffered should we resolve */
+          if (this._bufferedTranscripts.length === 0) return;
+          evt = this.consolidateTranscripts(this._bufferedTranscripts, channel, this.language);
+          this._bufferedTranscripts = [];
+        }
+
+        /* here is where we return a final transcript */
+        this.logger.debug({evt}, 'TaskTranscribe:_onTranscription - sending final transcript');
+        this._resolve(channel, evt);
+        /* some STT engines will keep listening after a final response, so no need to restart */
+        if (!['soniox', 'aws', 'microsoft', 'deepgram'].includes(this.vendor)) this._startTranscribing(cs, ep, channel);
+      }
+    }
+    else {
+      /* interim transcript */
+
+      /* deepgram can send a non-final transcript but with words that are final, so we need to buffer */
+      if (this.vendor === 'deepgram') {
+        const originalEvent = evt.vendor.evt;
+        if (originalEvent.is_final && evt.alternatives[0].transcript !== '') {
+          this.logger.debug({evt}, 'Gather:_onTranscription - buffering a completed (partial) deepgram transcript');
+          this._bufferedTranscripts.push(evt);
+        }
+      }
+
+      if (this.interim) {
+        this.logger.debug({evt}, 'TaskTranscribe:_onTranscription - sending interim transcript');
+        this._resolve(channel, evt);
+      }
+    }
   }
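The deepgram branches above all follow one pattern: buffer every fragment that carries final words (deepgram can mark words final inside an event that is itself non-final), then resolve once, with a consolidated transcript, when a final transcript or UtteranceEnd arrives. A self-contained sketch of that pattern with a simplified event shape (the real consolidateTranscripts helper also merges timing and confidence data):

// Sketch of the buffer-and-consolidate pattern, not repo code.
const buffered = [];

function onDeepgramEvent(evt) {
  const text = evt.alternatives[0].transcript;
  if (!evt.is_final) {
    /* an interim event may still carry words deepgram considers final */
    if (evt.vendor.evt.is_final && text !== '') buffered.push(evt);
    return null;                             // nothing to resolve yet
  }
  if (text !== '') buffered.push(evt);
  if (buffered.length === 0) return null;    // empty final with nothing buffered: ignore
  const transcript = buffered.map((e) => e.alternatives[0].transcript).join(' ');
  buffered.length = 0;                       // reset for the next utterance
  return transcript;                         // caller resolves this consolidated text
}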
@@ -479,6 +558,7 @@ class TaskTranscribe extends SttTask {
 
     if (this.vendor === 'nuance') {
       const {code, error} = evt;
+      //TODO: fix below, currently _resolve does not send timeout events
       if (code === 404 && error === 'No speech') return this._resolve('timeout');
       if (code === 413 && error === 'Too much speech') return this._resolve('timeout');
     }