deepgram: rework continuous asr, and resolve on speech_final not is_final (#501)

* deepgram: rework continuous asr, and resolve on speech_final not is_final (wip)

* wip

* deepgram: empty final transcript should trigger resolve with speech if we have buffered transcripts

* wip

* fixes for deepgram compiling multiple transcripts

* test deepgram utteranceEndMs

* more handling of utteranceEndMs

* wip

* better handling of digit strings collected over multiple deepgram responses

* wip

* add support for deepgramOptions.shortUtterance, which triggers off of is_final instead of speech_final (see the example after this list)

* apply deepgram fixes to transcribe

* cleanup continuous asr

* more continuous asr fixes for deepgram

* update to verb-specifications for handling SttTask properties

* set log level for tests back to error
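
For context, the recognizer options exercised above ride on the gather verb. A minimal sketch of a webhook response using them, assuming a hypothetical app (property names follow this diff; the actionHook path and values are invented, and the options are shown together only for brevity — shortUtterance targets quick single responses, while asrTimeout/utteranceEndMs target continuous ASR):

    // hypothetical jambonz gather verb exercising the options this commit touches
    const gather = {
      verb: 'gather',
      input: ['speech', 'digits'],
      actionHook: '/transcript',        // invented path
      recognizer: {
        vendor: 'deepgram',
        language: 'en-US',
        asrTimeout: 2,                  // seconds; > 0 enables continuous ASR
        asrDtmfTerminationDigit: '#',   // dtmf key that ends continuous ASR early
        deepgramOptions: {
          shortUtterance: true          // resolve on is_final rather than speech_final
          // utteranceEndMs: 1500       // alternative: have deepgram signal UtteranceEnd
        }
      }
    };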
Author: Dave Horton
Date: 2023-10-30 13:57:25 -04:00
Committed by: GitHub
Parent: 67f8f7181a
Commit: f43a5c1491
4 changed files with 179 additions and 60 deletions


@@ -20,16 +20,6 @@ const makeTask = require('./make_task');
const assert = require('assert');
const SttTask = require('./stt-task');
const compileTranscripts = (logger, evt, arr) => {
if (!Array.isArray(arr) || arr.length === 0) return;
let t = '';
for (const a of arr) {
t += ` ${a.alternatives[0].transcript}`;
}
t += ` ${evt.alternatives[0].transcript}`;
evt.alternatives[0].transcript = t.trim();
};
class TaskGather extends SttTask {
constructor(logger, opts, parentTask) {
super(logger, opts, parentTask);
@@ -51,8 +41,10 @@ class TaskGather extends SttTask {
/* continuous ASR (i.e. compile transcripts until a special timeout or dtmf key) */
this.asrTimeout = typeof this.data.recognizer.asrTimeout === 'number' ?
this.data.recognizer.asrTimeout * 1000 : 0;
if (this.asrTimeout > 0) this.asrDtmfTerminationDigit = this.data.recognizer.asrDtmfTerminationDigit;
this.isContinuousAsr = this.asrTimeout > 0;
if (this.asrTimeout > 0) {
this.isContinuousAsr = true;
this.asrDtmfTerminationDigit = this.data.recognizer.asrDtmfTerminationDigit;
}
if (Array.isArray(this.data.recognizer.hints) &&
0 == this.data.recognizer.hints.length && JAMBONES_GATHER_CLEAR_GLOBAL_HINTS_ON_EMPTY_HINTS) {
@@ -351,6 +343,13 @@ class TaskGather extends SttTask {
async _setSpeechHandlers(cs, ep) {
if (this._speechHandlersSet) return;
this._speechHandlersSet = true;
/* some special deepgram logic */
if (this.vendor === 'deepgram') {
if (this.isContinuousAsr) this._doContinuousAsrWithDeepgram(this.asrTimeout);
if (this.data.recognizer?.deepgramOptions?.shortUtterance) this.shortUtterance = true;
}
const opts = this.setChannelVarsForStt(this, this.sttCredentials, this.data.recognizer);
switch (this.vendor) {
case 'google':
@@ -396,6 +395,9 @@ class TaskGather extends SttTask {
ep.addCustomEventListener(DeepgramTranscriptionEvents.Connect, this._onDeepgramConnect.bind(this, cs, ep));
ep.addCustomEventListener(DeepgramTranscriptionEvents.ConnectFailure,
this._onDeepGramConnectFailure.bind(this, cs, ep));
/* if app sets deepgramOptions.utteranceEndMs they essentially want continuous asr */
if (opts.DEEPGRAM_SPEECH_UTTERANCE_END_MS) this.isContinuousAsr = true;
break;
case 'soniox':
@@ -487,6 +489,12 @@ class TaskGather extends SttTask {
interim: this.interim,
bugname: this.bugname
}, 'Gather:_startTranscribing');
/**
* Note: we don't need to ask deepgram for interim results, because they
* already send us words as they are finalized (is_final=true) even before
* the utterance is finalized (speech_final=true)
*/
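/* For illustration, deepgram's streaming results look roughly like this
 * (shape per their streaming API, transcripts invented):
 *   {"type":"Results","is_final":true,"speech_final":false,"channel":{"alternatives":[{"transcript":"my number is"}]}}
 *   {"type":"Results","is_final":true,"speech_final":true,"channel":{"alternatives":[{"transcript":"555 1212"}]}}
 * the gather task buffers each is_final chunk and resolves once it sees
 * speech_final=true (or a later UtteranceEnd event when utteranceEndMs is set).
 */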
ep.startTranscription({
vendor: this.vendor,
locale: this.language,
@@ -522,11 +530,13 @@ class TaskGather extends SttTask {
}
_startAsrTimer() {
if (this.vendor === 'deepgram') return; // deepgram endpoints the utterance itself (utteranceEndMs / speech_final), so no local asr timer is needed
assert(this.isContinuousAsr);
this._clearAsrTimer();
this._asrTimer = setTimeout(() => {
this.logger.debug('_startAsrTimer - asr timer went off');
this._resolve(this._bufferedTranscripts.length > 0 ? 'speech' : 'timeout');
const evt = this.consolidateTranscripts(this._bufferedTranscripts, 1, this.language);
this._resolve(this._bufferedTranscripts.length > 0 ? 'speech' : 'timeout', evt);
}, this.asrTimeout);
this.logger.debug(`_startAsrTimer: set for ${this.asrTimeout}ms`);
}
@@ -556,7 +566,8 @@ class TaskGather extends SttTask {
this._clearFinalAsrTimer();
this._finalAsrTimer = setTimeout(() => {
this.logger.debug('_startFinalAsrTimer - final asr timer went off');
this._resolve(this._bufferedTranscripts.length > 0 ? 'speech' : 'timeout');
const evt = this.consolidateTranscripts(this._bufferedTranscripts, 1, this.language);
this._resolve(this._bufferedTranscripts.length > 0 ? 'speech' : 'timeout', evt);
}, 1000);
this.logger.debug('_startFinalAsrTimer: set for 1 second');
}
@@ -595,11 +606,23 @@ class TaskGather extends SttTask {
this.logger.debug({evt, bugname, finished}, `Gather:_onTranscription for vendor ${this.vendor}`);
if (bugname && this.bugname !== bugname) return;
if (this.vendor === 'ibm') {
if (evt?.state === 'listening') return;
if (this.vendor === 'ibm' && evt?.state === 'listening') return;
if (this.vendor === 'deepgram' && evt.type === 'UtteranceEnd') {
/* we will only get this when we have set utterance_end_ms */
if (this._bufferedTranscripts.length === 0) {
this.logger.debug('Gather:_onTranscription - got UtteranceEnd event from deepgram but no buffered transcripts');
}
else {
this.logger.debug('Gather:_onTranscription - got UtteranceEnd event from deepgram, return buffered transcript');
evt = this.consolidateTranscripts(this._bufferedTranscripts, 1, this.language);
this._bufferedTranscripts = [];
this._resolve('speech', evt);
}
return;
}
evt = this.normalizeTranscription(evt, this.vendor, 1, this.language);
evt = this.normalizeTranscription(evt, this.vendor, 1, this.language, this.shortUtterance);
if (evt.alternatives.length === 0) {
this.logger.info({evt}, 'TaskGather:_onTranscription - got empty transcript, continue listening');
return;
@@ -621,15 +644,27 @@ class TaskGather extends SttTask {
const bufferedWords = this._sonioxTranscripts.length +
this._bufferedTranscripts.reduce((count, e) => count + e.alternatives[0]?.transcript.split(' ').length, 0);
let emptyTranscript = false;
if (evt.is_final) {
if (evt.alternatives[0].transcript === '' && !this.callSession.callGone && !this.killed) {
emptyTranscript = true;
if (finished === 'true' && ['microsoft', 'deepgram'].includes(this.vendor)) {
this.logger.debug({evt}, 'TaskGather:_onTranscription - got empty transcript from old gather, disregarding');
return;
}
else {
else if (this.vendor !== 'deepgram') {
this.logger.info({evt}, 'TaskGather:_onTranscription - got empty transcript, continue listening');
return;
}
else if (this.isContinuousAsr) {
this.logger.info({evt},
'TaskGather:_onTranscription - got empty deepgram transcript during continuous asr, continue listening');
return;
}
else if (this.vendor === 'deepgram' && this._bufferedTranscripts.length > 0) {
this.logger.info({evt},
'TaskGather:_onTranscription - got empty transcript from deepgram, return the buffered transcripts');
}
return;
}
if (this.isContinuousAsr) {
@@ -641,14 +676,14 @@ class TaskGather extends SttTask {
this.logger.debug('TaskGather:_onTranscription - removing trailing punctuation');
evt.alternatives[0].transcript = t.slice(0, -1);
}
else this.logger.debug({t}, 'TaskGather:_onTranscription - no trailing punctuation');
}
this.logger.info({evt}, 'TaskGather:_onTranscription - got transcript during continuous asr');
this._bufferedTranscripts.push(evt);
this._clearTimer();
if (this._finalAsrTimer) {
this._clearFinalAsrTimer();
return this._resolve(this._bufferedTranscripts.length > 0 ? 'speech' : 'timeout');
const evt = this.consolidateTranscripts(this._bufferedTranscripts, 1, this.language);
return this._resolve(this._bufferedTranscripts.length > 0 ? 'speech' : 'timeout', evt);
}
this._startAsrTimer();
@@ -670,16 +705,25 @@ class TaskGather extends SttTask {
evt = this.compileSonioxTranscripts(this._sonioxTranscripts, 1, this.language);
this._sonioxTranscripts = [];
}
else if (this.vendor === 'deepgram') {
/* compile transcripts into one */
if (!emptyTranscript) this._bufferedTranscripts.push(evt);
if (this.data.recognizer?.deepgramOptions?.utteranceEndMs) {
this.logger.debug('TaskGather:_onTranscription - got speech_final waiting for UtteranceEnd event');
return;
}
this.logger.debug({evt}, 'TaskGather:_onTranscription - compiling deepgram transcripts');
evt = this.consolidateTranscripts(this._bufferedTranscripts, 1, this.language);
this._bufferedTranscripts = [];
this.logger.debug({evt}, 'TaskGather:_onTranscription - compiled deepgram transcripts');
}
/* here is where we return a final transcript */
this._resolve('speech', evt);
}
}
}
else {
/* google has a measure of stability:
https://cloud.google.com/speech-to-text/docs/basics#streaming_responses
others do not.
*/
//const isStableEnough = typeof evt.stability === 'undefined' || evt.stability > GATHER_STABILITY_THRESHOLD;
this._clearTimer();
this._startTimer();
if (this.bargein && (words + bufferedWords) >= this.minBargeinWordCount) {
@@ -705,6 +749,14 @@ class TaskGather extends SttTask {
this._sonioxTranscripts.push(evt.vendor.finalWords);
}
}
/* deepgram can send a non-final transcript but with words that are final, so we need to buffer */
if (this.vendor === 'deepgram') {
const originalEvent = evt.vendor.evt;
if (originalEvent.is_final && evt.alternatives[0].transcript !== '') {
this.logger.debug({evt}, 'Gather:_onTranscription - buffering a completed (partial) deepgram transcript');
this._bufferedTranscripts.push(evt);
}
}
}
}
_onEndOfUtterance(cs, ep) {
@@ -719,7 +771,7 @@ class TaskGather extends SttTask {
* getting a transcription. This can happen if someone coughs or mumbles.
* For that reason don't ask for a single utterance and we'll terminate the transcribe operation
* once we get a final transcript.
* However, if the usr has specified a singleUtterance, then we need to restart here
* However, if the user has specified a singleUtterance, then we need to restart here
* since we don't have a final transcript yet.
*/
if (!this.resolved && !this.killed && !this._bufferedTranscripts.length && this.wantsSingleUtterance) {
@@ -858,18 +910,6 @@ class TaskGather extends SttTask {
this._clearTimer();
this._clearFastRecognitionTimer();
if (this.isContinuousAsr && reason.startsWith('speech')) {
evt = {
is_final: true,
transcripts: this._bufferedTranscripts
};
this.logger.debug({evt}, 'TaskGather:resolve continuous asr');
}
else if (!this.isContinuousAsr && reason.startsWith('speech') && this._bufferedTranscripts.length) {
compileTranscripts(this.logger, evt, this._bufferedTranscripts);
this.logger.debug({evt}, 'TaskGather:resolve buffered results');
}
this.span.setAttributes({
channel: 1,
'stt.resolve': reason,