From e1497f90a8b07b57c979a4b2b193c3362ff8136b Mon Sep 17 00:00:00 2001 From: Dave Horton Date: Mon, 1 Apr 2024 13:03:52 -0400 Subject: [PATCH] update with various deepgram fixes, including for #700 --- lib/tasks/transcribe.js | 101 +++++++++++++++++++++++++++++++--------- 1 file changed, 80 insertions(+), 21 deletions(-) diff --git a/lib/tasks/transcribe.js b/lib/tasks/transcribe.js index 32059980..a5f9d0a6 100644 --- a/lib/tasks/transcribe.js +++ b/lib/tasks/transcribe.js @@ -303,12 +303,12 @@ class TaskTranscribe extends SttTask { async _onTranscription(cs, ep, channel, evt, fsEvent) { // make sure this is not a transcript from answering machine detection const bugname = fsEvent.getHeader('media-bugname'); + const finished = fsEvent.getHeader('transcription-session-finished'); if (bugname && this.bugname !== bugname) return; if (this.paused) { this.logger.debug({evt}, 'TaskTranscribe:_onTranscription - paused, ignoring transcript'); } - if (this.vendor === 'ibm' && evt?.state === 'listening') return; if (this.vendor === 'deepgram' && evt.type === 'UtteranceEnd') { @@ -319,8 +319,9 @@ class TaskTranscribe extends SttTask { else { this.logger.debug('Gather:_onTranscription - got UtteranceEnd event from deepgram, return buffered transcript'); evt = this.consolidateTranscripts(this._bufferedTranscripts, 1, this.language, this.vendor); + evt.is_final = true; this._bufferedTranscripts = []; - this._resolve('speech', evt); + this._resolve(channel, evt); } return; } @@ -334,31 +335,89 @@ class TaskTranscribe extends SttTask { return; } - if (evt.alternatives[0]?.transcript === '' && !cs.callGone && !this.killed) { - if (['microsoft', 'deepgram'].includes(this.vendor)) { - this.logger.info({evt}, 'TaskTranscribe:_onTranscription - got empty transcript, continue listening'); + let emptyTranscript = false; + if (evt.is_final) { + if (evt.alternatives[0].transcript === '' && !cs.callGone && !this.killed) { + emptyTranscript = true; + if (finished === 'true' && + ['microsoft', 'deepgram'].includes(this.vendor) && + this._bufferedTranscripts.length === 0) { + this.logger.debug({evt}, 'TaskGather:_onTranscription - got empty transcript from old gather, disregarding'); + return; + } + else if (this.vendor !== 'deepgram') { + this.logger.info({evt}, 'TaskGather:_onTranscription - got empty transcript, continue listening'); + return; + } + else if (this.isContinuousAsr) { + this.logger.info({evt}, + 'TaskGather:_onTranscription - got empty deepgram transcript during continous asr, continue listening'); + return; + } + else if (this.vendor === 'deepgram' && this._bufferedTranscripts.length > 0) { + this.logger.info({evt}, + 'TaskGather:_onTranscription - got empty transcript from deepgram, return the buffered transcripts'); + } + } + if (this.isContinuousAsr) { + /* append the transcript and start listening again for asrTimeout */ + const t = evt.alternatives[0].transcript; + if (t) { + /* remove trailing punctuation */ + if (/[,;:\.!\?]$/.test(t)) { + this.logger.debug('TaskGather:_onTranscription - removing trailing punctuation'); + evt.alternatives[0].transcript = t.slice(0, -1); + } + } + this.logger.info({evt}, 'TaskGather:_onTranscription - got transcript during continous asr'); + this._bufferedTranscripts.push(evt); + this._startAsrTimer(channel); + + /* some STT engines will keep listening after a final response, so no need to restart */ + if (!['soniox', 'aws', 'microsoft', 'deepgram', 'google'] + .includes(this.vendor)) this._startTranscribing(cs, ep, channel); } else { - this.logger.info({evt}, 'TaskTranscribe:_onTranscription - got empty transcript, listen again'); - this._transcribe(ep); - } - return; - } + if (this.vendor === 'soniox') { + /* compile transcripts into one */ + this._sonioxTranscripts.push(evt.vendor.finalWords); + evt = this.compileSonioxTranscripts(this._sonioxTranscripts, 1, this.language); + this._sonioxTranscripts = []; + } + else if (this.vendor === 'deepgram') { + /* compile transcripts into one */ + if (!emptyTranscript) this._bufferedTranscripts.push(evt); - if (this.vendor === 'soniox') { - /* compile transcripts into one */ - this._sonioxTranscripts.push(evt.vendor.finalWords); - if (evt.is_final) { - evt = this.compileSonioxTranscripts(this._sonioxTranscripts, 1, this.language); - this._sonioxTranscripts = []; + /* deepgram can send an empty and final transcript; only if we have any buffered should we resolve */ + if (this._bufferedTranscripts.length === 0) return; + evt = this.consolidateTranscripts(this._bufferedTranscripts, channel, this.language); + this._bufferedTranscripts = []; + } + + /* here is where we return a final transcript */ + this.logger.debug({evt}, 'TaskTranscribe:_onTranscription - sending final transcript'); + this._resolve(channel, evt); + /* some STT engines will keep listening after a final response, so no need to restart */ + if (!['soniox', 'aws', 'microsoft', 'deepgram', 'google'] + .includes(this.vendor)) this._startTranscribing(cs, ep, channel); } } + else { + /* interim transcript */ - if (this.isContinuousAsr && evt.is_final) { - this._bufferedTranscripts.push(evt); - this._startAsrTimer(channel); - } else { - await this._resolve(channel, evt); + /* deepgram can send a non-final transcript but with words that are final, so we need to buffer */ + if (this.vendor === 'deepgram') { + const originalEvent = evt.vendor.evt; + if (originalEvent.is_final && evt.alternatives[0].transcript !== '') { + this.logger.debug({evt}, 'Gather:_onTranscription - buffering a completed (partial) deepgram transcript'); + this._bufferedTranscripts.push(evt); + } + } + + if (this.interim) { + this.logger.debug({evt}, 'TaskTranscribe:_onTranscription - sending interim transcript'); + this._resolve(channel, evt); + } } }