mirror of
https://github.com/jambonz/jambonz-feature-server.git
synced 2025-12-20 16:50:39 +00:00
deepgram: rework continuous asr, and resolve on speech_final not is_f… (#501)
* deepgram: rework continuous asr, and resolve on speech_final not is_final (wip) * wip * deepgram: empty final transcript should trigger resolve with speech if we have buffered transcripts * wip * fixes for deepgram compiling multiple transcripts * test deepgram utteranceEndMs * more handling of utteranceEndMs * wip * better handling of digit strings collected over multiple deepgram responses * wip * add support for deepgramOptions.shortUtterance which triggers off of is_final instead of speech_final * apply deepgram fixes to transcribe * cleanup continnuous asr * more continuous asr fixes for deepgram * update to verb-specifications for handling SttTask properties * set log level for tests back to error
This commit is contained in:
@@ -20,16 +20,6 @@ const makeTask = require('./make_task');
|
|||||||
const assert = require('assert');
|
const assert = require('assert');
|
||||||
const SttTask = require('./stt-task');
|
const SttTask = require('./stt-task');
|
||||||
|
|
||||||
const compileTranscripts = (logger, evt, arr) => {
|
|
||||||
if (!Array.isArray(arr) || arr.length === 0) return;
|
|
||||||
let t = '';
|
|
||||||
for (const a of arr) {
|
|
||||||
t += ` ${a.alternatives[0].transcript}`;
|
|
||||||
}
|
|
||||||
t += ` ${evt.alternatives[0].transcript}`;
|
|
||||||
evt.alternatives[0].transcript = t.trim();
|
|
||||||
};
|
|
||||||
|
|
||||||
class TaskGather extends SttTask {
|
class TaskGather extends SttTask {
|
||||||
constructor(logger, opts, parentTask) {
|
constructor(logger, opts, parentTask) {
|
||||||
super(logger, opts, parentTask);
|
super(logger, opts, parentTask);
|
||||||
@@ -51,8 +41,10 @@ class TaskGather extends SttTask {
|
|||||||
/* continuous ASR (i.e. compile transcripts until a special timeout or dtmf key) */
|
/* continuous ASR (i.e. compile transcripts until a special timeout or dtmf key) */
|
||||||
this.asrTimeout = typeof this.data.recognizer.asrTimeout === 'number' ?
|
this.asrTimeout = typeof this.data.recognizer.asrTimeout === 'number' ?
|
||||||
this.data.recognizer.asrTimeout * 1000 : 0;
|
this.data.recognizer.asrTimeout * 1000 : 0;
|
||||||
if (this.asrTimeout > 0) this.asrDtmfTerminationDigit = this.data.recognizer.asrDtmfTerminationDigit;
|
if (this.asrTimeout > 0) {
|
||||||
this.isContinuousAsr = this.asrTimeout > 0;
|
this.isContinuousAsr = true;
|
||||||
|
this.asrDtmfTerminationDigit = this.data.recognizer.asrDtmfTerminationDigit;
|
||||||
|
}
|
||||||
|
|
||||||
if (Array.isArray(this.data.recognizer.hints) &&
|
if (Array.isArray(this.data.recognizer.hints) &&
|
||||||
0 == this.data.recognizer.hints.length && JAMBONES_GATHER_CLEAR_GLOBAL_HINTS_ON_EMPTY_HINTS) {
|
0 == this.data.recognizer.hints.length && JAMBONES_GATHER_CLEAR_GLOBAL_HINTS_ON_EMPTY_HINTS) {
|
||||||
@@ -351,6 +343,13 @@ class TaskGather extends SttTask {
|
|||||||
async _setSpeechHandlers(cs, ep) {
|
async _setSpeechHandlers(cs, ep) {
|
||||||
if (this._speechHandlersSet) return;
|
if (this._speechHandlersSet) return;
|
||||||
this._speechHandlersSet = true;
|
this._speechHandlersSet = true;
|
||||||
|
|
||||||
|
/* some special deepgram logic */
|
||||||
|
if (this.vendor === 'deepgram') {
|
||||||
|
if (this.isContinuousAsr) this._doContinuousAsrWithDeepgram(this.asrTimeout);
|
||||||
|
if (this.data.recognizer?.deepgramOptions?.shortUtterance) this.shortUtterance = true;
|
||||||
|
}
|
||||||
|
|
||||||
const opts = this.setChannelVarsForStt(this, this.sttCredentials, this.data.recognizer);
|
const opts = this.setChannelVarsForStt(this, this.sttCredentials, this.data.recognizer);
|
||||||
switch (this.vendor) {
|
switch (this.vendor) {
|
||||||
case 'google':
|
case 'google':
|
||||||
@@ -396,6 +395,9 @@ class TaskGather extends SttTask {
|
|||||||
ep.addCustomEventListener(DeepgramTranscriptionEvents.Connect, this._onDeepgramConnect.bind(this, cs, ep));
|
ep.addCustomEventListener(DeepgramTranscriptionEvents.Connect, this._onDeepgramConnect.bind(this, cs, ep));
|
||||||
ep.addCustomEventListener(DeepgramTranscriptionEvents.ConnectFailure,
|
ep.addCustomEventListener(DeepgramTranscriptionEvents.ConnectFailure,
|
||||||
this._onDeepGramConnectFailure.bind(this, cs, ep));
|
this._onDeepGramConnectFailure.bind(this, cs, ep));
|
||||||
|
|
||||||
|
/* if app sets deepgramOptions.utteranceEndMs they essentially want continuous asr */
|
||||||
|
if (opts.DEEPGRAM_SPEECH_UTTERANCE_END_MS) this.isContinuousAsr = true;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case 'soniox':
|
case 'soniox':
|
||||||
@@ -487,6 +489,12 @@ class TaskGather extends SttTask {
|
|||||||
interim: this.interim,
|
interim: this.interim,
|
||||||
bugname: this.bugname
|
bugname: this.bugname
|
||||||
}, 'Gather:_startTranscribing');
|
}, 'Gather:_startTranscribing');
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Note: we don't need to ask deepgram for interim results, because they
|
||||||
|
* already send us words as they are finalized (is_final=true) even before
|
||||||
|
* the utterance is finalized (speech_final=true)
|
||||||
|
*/
|
||||||
ep.startTranscription({
|
ep.startTranscription({
|
||||||
vendor: this.vendor,
|
vendor: this.vendor,
|
||||||
locale: this.language,
|
locale: this.language,
|
||||||
@@ -522,11 +530,13 @@ class TaskGather extends SttTask {
|
|||||||
}
|
}
|
||||||
|
|
||||||
_startAsrTimer() {
|
_startAsrTimer() {
|
||||||
|
if (this.vendor === 'deepgram') return; // no need
|
||||||
assert(this.isContinuousAsr);
|
assert(this.isContinuousAsr);
|
||||||
this._clearAsrTimer();
|
this._clearAsrTimer();
|
||||||
this._asrTimer = setTimeout(() => {
|
this._asrTimer = setTimeout(() => {
|
||||||
this.logger.debug('_startAsrTimer - asr timer went off');
|
this.logger.debug('_startAsrTimer - asr timer went off');
|
||||||
this._resolve(this._bufferedTranscripts.length > 0 ? 'speech' : 'timeout');
|
const evt = this.consolidateTranscripts(this._bufferedTranscripts, 1, this.language);
|
||||||
|
this._resolve(this._bufferedTranscripts.length > 0 ? 'speech' : 'timeout', evt);
|
||||||
}, this.asrTimeout);
|
}, this.asrTimeout);
|
||||||
this.logger.debug(`_startAsrTimer: set for ${this.asrTimeout}ms`);
|
this.logger.debug(`_startAsrTimer: set for ${this.asrTimeout}ms`);
|
||||||
}
|
}
|
||||||
@@ -556,7 +566,8 @@ class TaskGather extends SttTask {
|
|||||||
this._clearFinalAsrTimer();
|
this._clearFinalAsrTimer();
|
||||||
this._finalAsrTimer = setTimeout(() => {
|
this._finalAsrTimer = setTimeout(() => {
|
||||||
this.logger.debug('_startFinalAsrTimer - final asr timer went off');
|
this.logger.debug('_startFinalAsrTimer - final asr timer went off');
|
||||||
this._resolve(this._bufferedTranscripts.length > 0 ? 'speech' : 'timeout');
|
const evt = this.consolidateTranscripts(this._bufferedTranscripts, 1, this.language);
|
||||||
|
this._resolve(this._bufferedTranscripts.length > 0 ? 'speech' : 'timeout', evt);
|
||||||
}, 1000);
|
}, 1000);
|
||||||
this.logger.debug('_startFinalAsrTimer: set for 1 second');
|
this.logger.debug('_startFinalAsrTimer: set for 1 second');
|
||||||
}
|
}
|
||||||
@@ -595,11 +606,23 @@ class TaskGather extends SttTask {
|
|||||||
this.logger.debug({evt, bugname, finished}, `Gather:_onTranscription for vendor ${this.vendor}`);
|
this.logger.debug({evt, bugname, finished}, `Gather:_onTranscription for vendor ${this.vendor}`);
|
||||||
if (bugname && this.bugname !== bugname) return;
|
if (bugname && this.bugname !== bugname) return;
|
||||||
|
|
||||||
if (this.vendor === 'ibm') {
|
if (this.vendor === 'ibm' && evt?.state === 'listening') return;
|
||||||
if (evt?.state === 'listening') return;
|
|
||||||
|
if (this.vendor === 'deepgram' && evt.type === 'UtteranceEnd') {
|
||||||
|
/* we will only get this when we have set utterance_end_ms */
|
||||||
|
if (this._bufferedTranscripts.length === 0) {
|
||||||
|
this.logger.debug('Gather:_onTranscription - got UtteranceEnd event from deepgram but no buffered transcripts');
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
this.logger.debug('Gather:_onTranscription - got UtteranceEnd event from deepgram, return buffered transcript');
|
||||||
|
evt = this.consolidateTranscripts(this._bufferedTranscripts, 1, this.language);
|
||||||
|
this._bufferedTranscripts = [];
|
||||||
|
this._resolve('speech', evt);
|
||||||
|
}
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
evt = this.normalizeTranscription(evt, this.vendor, 1, this.language);
|
evt = this.normalizeTranscription(evt, this.vendor, 1, this.language, this.shortUtterance);
|
||||||
if (evt.alternatives.length === 0) {
|
if (evt.alternatives.length === 0) {
|
||||||
this.logger.info({evt}, 'TaskGather:_onTranscription - got empty transcript, continue listening');
|
this.logger.info({evt}, 'TaskGather:_onTranscription - got empty transcript, continue listening');
|
||||||
return;
|
return;
|
||||||
@@ -621,16 +644,28 @@ class TaskGather extends SttTask {
|
|||||||
const bufferedWords = this._sonioxTranscripts.length +
|
const bufferedWords = this._sonioxTranscripts.length +
|
||||||
this._bufferedTranscripts.reduce((count, e) => count + e.alternatives[0]?.transcript.split(' ').length, 0);
|
this._bufferedTranscripts.reduce((count, e) => count + e.alternatives[0]?.transcript.split(' ').length, 0);
|
||||||
|
|
||||||
|
let emptyTranscript = false;
|
||||||
if (evt.is_final) {
|
if (evt.is_final) {
|
||||||
if (evt.alternatives[0].transcript === '' && !this.callSession.callGone && !this.killed) {
|
if (evt.alternatives[0].transcript === '' && !this.callSession.callGone && !this.killed) {
|
||||||
|
emptyTranscript = true;
|
||||||
if (finished === 'true' && ['microsoft', 'deepgram'].includes(this.vendor)) {
|
if (finished === 'true' && ['microsoft', 'deepgram'].includes(this.vendor)) {
|
||||||
this.logger.debug({evt}, 'TaskGather:_onTranscription - got empty transcript from old gather, disregarding');
|
this.logger.debug({evt}, 'TaskGather:_onTranscription - got empty transcript from old gather, disregarding');
|
||||||
}
|
|
||||||
else {
|
|
||||||
this.logger.info({evt}, 'TaskGather:_onTranscription - got empty transcript, continue listening');
|
|
||||||
}
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
else if (this.vendor !== 'deepgram') {
|
||||||
|
this.logger.info({evt}, 'TaskGather:_onTranscription - got empty transcript, continue listening');
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
else if (this.isContinuousAsr) {
|
||||||
|
this.logger.info({evt},
|
||||||
|
'TaskGather:_onTranscription - got empty deepgram transcript during continous asr, continue listening');
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
else if (this.vendor === 'deepgram' && this._bufferedTranscripts.length > 0) {
|
||||||
|
this.logger.info({evt},
|
||||||
|
'TaskGather:_onTranscription - got empty transcript from deepgram, return the buffered transcripts');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (this.isContinuousAsr) {
|
if (this.isContinuousAsr) {
|
||||||
/* append the transcript and start listening again for asrTimeout */
|
/* append the transcript and start listening again for asrTimeout */
|
||||||
@@ -641,14 +676,14 @@ class TaskGather extends SttTask {
|
|||||||
this.logger.debug('TaskGather:_onTranscription - removing trailing punctuation');
|
this.logger.debug('TaskGather:_onTranscription - removing trailing punctuation');
|
||||||
evt.alternatives[0].transcript = t.slice(0, -1);
|
evt.alternatives[0].transcript = t.slice(0, -1);
|
||||||
}
|
}
|
||||||
else this.logger.debug({t}, 'TaskGather:_onTranscription - no trailing punctuation');
|
|
||||||
}
|
}
|
||||||
this.logger.info({evt}, 'TaskGather:_onTranscription - got transcript during continous asr');
|
this.logger.info({evt}, 'TaskGather:_onTranscription - got transcript during continous asr');
|
||||||
this._bufferedTranscripts.push(evt);
|
this._bufferedTranscripts.push(evt);
|
||||||
this._clearTimer();
|
this._clearTimer();
|
||||||
if (this._finalAsrTimer) {
|
if (this._finalAsrTimer) {
|
||||||
this._clearFinalAsrTimer();
|
this._clearFinalAsrTimer();
|
||||||
return this._resolve(this._bufferedTranscripts.length > 0 ? 'speech' : 'timeout');
|
const evt = this.consolidateTranscripts(this._bufferedTranscripts, 1, this.language);
|
||||||
|
return this._resolve(this._bufferedTranscripts.length > 0 ? 'speech' : 'timeout', evt);
|
||||||
}
|
}
|
||||||
this._startAsrTimer();
|
this._startAsrTimer();
|
||||||
|
|
||||||
@@ -670,16 +705,25 @@ class TaskGather extends SttTask {
|
|||||||
evt = this.compileSonioxTranscripts(this._sonioxTranscripts, 1, this.language);
|
evt = this.compileSonioxTranscripts(this._sonioxTranscripts, 1, this.language);
|
||||||
this._sonioxTranscripts = [];
|
this._sonioxTranscripts = [];
|
||||||
}
|
}
|
||||||
|
else if (this.vendor === 'deepgram') {
|
||||||
|
/* compile transcripts into one */
|
||||||
|
if (!emptyTranscript) this._bufferedTranscripts.push(evt);
|
||||||
|
if (this.data.recognizer?.deepgramOptions?.utteranceEndMs) {
|
||||||
|
this.logger.debug('TaskGather:_onTranscription - got speech_final waiting for UtteranceEnd event');
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
this.logger.debug({evt}, 'TaskGather:_onTranscription - compiling deepgram transcripts');
|
||||||
|
evt = this.consolidateTranscripts(this._bufferedTranscripts, 1, this.language);
|
||||||
|
this._bufferedTranscripts = [];
|
||||||
|
this.logger.debug({evt}, 'TaskGather:_onTranscription - compiled deepgram transcripts');
|
||||||
|
}
|
||||||
|
|
||||||
|
/* here is where we return a final transcript */
|
||||||
this._resolve('speech', evt);
|
this._resolve('speech', evt);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
/* google has a measure of stability:
|
|
||||||
https://cloud.google.com/speech-to-text/docs/basics#streaming_responses
|
|
||||||
others do not.
|
|
||||||
*/
|
|
||||||
//const isStableEnough = typeof evt.stability === 'undefined' || evt.stability > GATHER_STABILITY_THRESHOLD;
|
|
||||||
this._clearTimer();
|
this._clearTimer();
|
||||||
this._startTimer();
|
this._startTimer();
|
||||||
if (this.bargein && (words + bufferedWords) >= this.minBargeinWordCount) {
|
if (this.bargein && (words + bufferedWords) >= this.minBargeinWordCount) {
|
||||||
@@ -705,6 +749,14 @@ class TaskGather extends SttTask {
|
|||||||
this._sonioxTranscripts.push(evt.vendor.finalWords);
|
this._sonioxTranscripts.push(evt.vendor.finalWords);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
/* deepgram can send a non-final transcript but with words that are final, so we need to buffer */
|
||||||
|
if (this.vendor === 'deepgram') {
|
||||||
|
const originalEvent = evt.vendor.evt;
|
||||||
|
if (originalEvent.is_final && evt.alternatives[0].transcript !== '') {
|
||||||
|
this.logger.debug({evt}, 'Gather:_onTranscription - buffering a completed (partial) deepgram transcript');
|
||||||
|
this._bufferedTranscripts.push(evt);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
_onEndOfUtterance(cs, ep) {
|
_onEndOfUtterance(cs, ep) {
|
||||||
@@ -719,7 +771,7 @@ class TaskGather extends SttTask {
|
|||||||
* getting a transcription. This can happen if someone coughs or mumbles.
|
* getting a transcription. This can happen if someone coughs or mumbles.
|
||||||
* For that reason don't ask for a single utterance and we'll terminate the transcribe operation
|
* For that reason don't ask for a single utterance and we'll terminate the transcribe operation
|
||||||
* once we get a final transcript.
|
* once we get a final transcript.
|
||||||
* However, if the usr has specified a singleUtterance, then we need to restart here
|
* However, if the user has specified a singleUtterance, then we need to restart here
|
||||||
* since we dont have a final transcript yet.
|
* since we dont have a final transcript yet.
|
||||||
*/
|
*/
|
||||||
if (!this.resolved && !this.killed && !this._bufferedTranscripts.length && this.wantsSingleUtterance) {
|
if (!this.resolved && !this.killed && !this._bufferedTranscripts.length && this.wantsSingleUtterance) {
|
||||||
@@ -858,18 +910,6 @@ class TaskGather extends SttTask {
|
|||||||
this._clearTimer();
|
this._clearTimer();
|
||||||
this._clearFastRecognitionTimer();
|
this._clearFastRecognitionTimer();
|
||||||
|
|
||||||
if (this.isContinuousAsr && reason.startsWith('speech')) {
|
|
||||||
evt = {
|
|
||||||
is_final: true,
|
|
||||||
transcripts: this._bufferedTranscripts
|
|
||||||
};
|
|
||||||
this.logger.debug({evt}, 'TaskGather:resolve continuous asr');
|
|
||||||
}
|
|
||||||
else if (!this.isContinuousAsr && reason.startsWith('speech') && this._bufferedTranscripts.length) {
|
|
||||||
compileTranscripts(this.logger, evt, this._bufferedTranscripts);
|
|
||||||
this.logger.debug({evt}, 'TaskGather:resolve buffered results');
|
|
||||||
}
|
|
||||||
|
|
||||||
this.span.setAttributes({
|
this.span.setAttributes({
|
||||||
channel: 1,
|
channel: 1,
|
||||||
'stt.resolve': reason,
|
'stt.resolve': reason,
|
||||||
|
|||||||
@@ -16,12 +16,14 @@ class SttTask extends Task {
|
|||||||
normalizeTranscription,
|
normalizeTranscription,
|
||||||
removeSpeechListeners,
|
removeSpeechListeners,
|
||||||
setSpeechCredentialsAtRuntime,
|
setSpeechCredentialsAtRuntime,
|
||||||
compileSonioxTranscripts
|
compileSonioxTranscripts,
|
||||||
|
consolidateTranscripts
|
||||||
} = require('../utils/transcription-utils')(logger);
|
} = require('../utils/transcription-utils')(logger);
|
||||||
this.setChannelVarsForStt = setChannelVarsForStt;
|
this.setChannelVarsForStt = setChannelVarsForStt;
|
||||||
this.normalizeTranscription = normalizeTranscription;
|
this.normalizeTranscription = normalizeTranscription;
|
||||||
this.removeSpeechListeners = removeSpeechListeners;
|
this.removeSpeechListeners = removeSpeechListeners;
|
||||||
this.compileSonioxTranscripts = compileSonioxTranscripts;
|
this.compileSonioxTranscripts = compileSonioxTranscripts;
|
||||||
|
this.consolidateTranscripts = consolidateTranscripts;
|
||||||
|
|
||||||
this.isHandledByPrimaryProvider = true;
|
this.isHandledByPrimaryProvider = true;
|
||||||
if (this.data.recognizer) {
|
if (this.data.recognizer) {
|
||||||
@@ -138,6 +140,14 @@ class SttTask extends Task {
|
|||||||
addKey(key, evt.compiled_context, 3600 * 12)
|
addKey(key, evt.compiled_context, 3600 * 12)
|
||||||
.catch((err) => this.logger.info({err}, `Error caching cobalt context for ${key}`));
|
.catch((err) => this.logger.info({err}, `Error caching cobalt context for ${key}`));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
_doContinuousAsrWithDeepgram(asrTimeout) {
|
||||||
|
/* deepgram has an utterance_end_ms property that simplifies things */
|
||||||
|
assert(this.vendor === 'deepgram');
|
||||||
|
this.logger.debug(`_doContinuousAsrWithDeepgram - setting utterance_end_ms to ${asrTimeout}`);
|
||||||
|
const dgOptions = this.data.recognizer.deepgramOptions = this.data.recognizer.deepgramOptions || {};
|
||||||
|
dgOptions.utteranceEndMs = dgOptions.utteranceEndMs || asrTimeout;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
module.exports = SttTask;
|
module.exports = SttTask;
|
||||||
|
|||||||
@@ -34,7 +34,9 @@ class TaskTranscribe extends SttTask {
|
|||||||
|
|
||||||
// Continuos asr timeout
|
// Continuos asr timeout
|
||||||
this.asrTimeout = typeof this.data.recognizer.asrTimeout === 'number' ? this.data.recognizer.asrTimeout * 1000 : 0;
|
this.asrTimeout = typeof this.data.recognizer.asrTimeout === 'number' ? this.data.recognizer.asrTimeout * 1000 : 0;
|
||||||
this.isContinuousAsr = this.asrTimeout > 0;
|
if (this.asrTimeout > 0) {
|
||||||
|
this.isContinuousAsr = true;
|
||||||
|
}
|
||||||
/* buffer speech for continuous asr */
|
/* buffer speech for continuous asr */
|
||||||
this._bufferedTranscripts = [];
|
this._bufferedTranscripts = [];
|
||||||
}
|
}
|
||||||
@@ -177,6 +179,12 @@ class TaskTranscribe extends SttTask {
|
|||||||
async _setSpeechHandlers(cs, ep, channel) {
|
async _setSpeechHandlers(cs, ep, channel) {
|
||||||
if (this[`_speechHandlersSet_${channel}`]) return;
|
if (this[`_speechHandlersSet_${channel}`]) return;
|
||||||
this[`_speechHandlersSet_${channel}`] = true;
|
this[`_speechHandlersSet_${channel}`] = true;
|
||||||
|
|
||||||
|
/* some special deepgram logic */
|
||||||
|
if (this.vendor === 'deepgram') {
|
||||||
|
if (this.isContinuousAsr) this._doContinuousAsrWithDeepgram(this.asrTimeout);
|
||||||
|
}
|
||||||
|
|
||||||
const opts = this.setChannelVarsForStt(this, this.sttCredentials, this.data.recognizer);
|
const opts = this.setChannelVarsForStt(this, this.sttCredentials, this.data.recognizer);
|
||||||
switch (this.vendor) {
|
switch (this.vendor) {
|
||||||
case 'google':
|
case 'google':
|
||||||
@@ -223,6 +231,10 @@ class TaskTranscribe extends SttTask {
|
|||||||
this._onDeepgramConnect.bind(this, cs, ep, channel));
|
this._onDeepgramConnect.bind(this, cs, ep, channel));
|
||||||
ep.addCustomEventListener(DeepgramTranscriptionEvents.ConnectFailure,
|
ep.addCustomEventListener(DeepgramTranscriptionEvents.ConnectFailure,
|
||||||
this._onDeepGramConnectFailure.bind(this, cs, ep, channel));
|
this._onDeepGramConnectFailure.bind(this, cs, ep, channel));
|
||||||
|
|
||||||
|
/* if app sets deepgramOptions.utteranceEndMs they essentially want continuous asr */
|
||||||
|
if (opts.DEEPGRAM_SPEECH_UTTERANCE_END_MS) this.isContinuousAsr = true;
|
||||||
|
|
||||||
break;
|
break;
|
||||||
case 'soniox':
|
case 'soniox':
|
||||||
this.bugname = 'soniox_transcribe';
|
this.bugname = 'soniox_transcribe';
|
||||||
@@ -329,8 +341,20 @@ class TaskTranscribe extends SttTask {
|
|||||||
const bugname = fsEvent.getHeader('media-bugname');
|
const bugname = fsEvent.getHeader('media-bugname');
|
||||||
if (bugname && this.bugname !== bugname) return;
|
if (bugname && this.bugname !== bugname) return;
|
||||||
|
|
||||||
if (this.vendor === 'ibm') {
|
if (this.vendor === 'ibm' && evt?.state === 'listening') return;
|
||||||
if (evt?.state === 'listening') return;
|
|
||||||
|
if (this.vendor === 'deepgram' && evt.type === 'UtteranceEnd') {
|
||||||
|
/* we will only get this when we have set utterance_end_ms */
|
||||||
|
if (this._bufferedTranscripts.length === 0) {
|
||||||
|
this.logger.debug('Gather:_onTranscription - got UtteranceEnd event from deepgram but no buffered transcripts');
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
this.logger.debug('Gather:_onTranscription - got UtteranceEnd event from deepgram, return buffered transcript');
|
||||||
|
evt = this.consolidateTranscripts(this._bufferedTranscripts, 1, this.language);
|
||||||
|
this._bufferedTranscripts = [];
|
||||||
|
this._resolve('speech', evt);
|
||||||
|
}
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
this.logger.debug({evt}, 'TaskTranscribe:_onTranscription - before normalization');
|
this.logger.debug({evt}, 'TaskTranscribe:_onTranscription - before normalization');
|
||||||
|
|
||||||
@@ -369,17 +393,6 @@ class TaskTranscribe extends SttTask {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
_compileTranscripts() {
|
|
||||||
assert(this._bufferedTranscripts.length);
|
|
||||||
const evt = this._bufferedTranscripts[0];
|
|
||||||
let t = '';
|
|
||||||
for (const a of this._bufferedTranscripts) {
|
|
||||||
t += ` ${a.alternatives[0].transcript}`;
|
|
||||||
}
|
|
||||||
evt.alternatives[0].transcript = t.trim();
|
|
||||||
return evt;
|
|
||||||
}
|
|
||||||
|
|
||||||
async _resolve(channel, evt) {
|
async _resolve(channel, evt) {
|
||||||
/* we've got a transcript, so end the otel child span for this channel */
|
/* we've got a transcript, so end the otel child span for this channel */
|
||||||
if (this.childSpan[channel - 1] && this.childSpan[channel - 1].span) {
|
if (this.childSpan[channel - 1] && this.childSpan[channel - 1].span) {
|
||||||
@@ -577,11 +590,12 @@ class TaskTranscribe extends SttTask {
|
|||||||
}
|
}
|
||||||
|
|
||||||
_startAsrTimer(channel) {
|
_startAsrTimer(channel) {
|
||||||
|
if (this.vendor === 'deepgram') return; // no need
|
||||||
assert(this.isContinuousAsr);
|
assert(this.isContinuousAsr);
|
||||||
this._clearAsrTimer(channel);
|
this._clearAsrTimer(channel);
|
||||||
this._asrTimer = setTimeout(() => {
|
this._asrTimer = setTimeout(() => {
|
||||||
this.logger.debug(`TaskTranscribe:_startAsrTimer - asr timer went off for channel: ${channel}`);
|
this.logger.debug(`TaskTranscribe:_startAsrTimer - asr timer went off for channel: ${channel}`);
|
||||||
const evt = this._compileTranscripts();
|
const evt = this.consolidateTranscripts(this._bufferedTranscripts, channel, this.language);
|
||||||
this._bufferedTranscripts = [];
|
this._bufferedTranscripts = [];
|
||||||
this._resolve(channel, evt);
|
this._resolve(channel, evt);
|
||||||
}, this.asrTimeout);
|
}, this.asrTimeout);
|
||||||
|
|||||||
@@ -52,6 +52,7 @@ const stickyVars = {
|
|||||||
'DEEPGRAM_SPEECH_SEARCH',
|
'DEEPGRAM_SPEECH_SEARCH',
|
||||||
'DEEPGRAM_SPEECH_REPLACE',
|
'DEEPGRAM_SPEECH_REPLACE',
|
||||||
'DEEPGRAM_SPEECH_ENDPOINTING',
|
'DEEPGRAM_SPEECH_ENDPOINTING',
|
||||||
|
'DEEPGRAM_SPEECH_UTTERANCE_END_MS',
|
||||||
'DEEPGRAM_SPEECH_VAD_TURNOFF',
|
'DEEPGRAM_SPEECH_VAD_TURNOFF',
|
||||||
'DEEPGRAM_SPEECH_TAG'
|
'DEEPGRAM_SPEECH_TAG'
|
||||||
],
|
],
|
||||||
@@ -106,6 +107,53 @@ const stickyVars = {
|
|||||||
]
|
]
|
||||||
};
|
};
|
||||||
|
|
||||||
|
const consolidateTranscripts = (bufferedTranscripts, channel, language) => {
|
||||||
|
if (bufferedTranscripts.length === 1) return bufferedTranscripts[0];
|
||||||
|
let totalConfidence = 0;
|
||||||
|
const finalTranscript = bufferedTranscripts.reduce((acc, evt) => {
|
||||||
|
totalConfidence += evt.alternatives[0].confidence;
|
||||||
|
|
||||||
|
let newTranscript = evt.alternatives[0].transcript;
|
||||||
|
|
||||||
|
// If new transcript consists only of digits, spaces, and a trailing comma or period
|
||||||
|
if (newTranscript.match(/^[\d\s]+[,.]?$/)) {
|
||||||
|
newTranscript = newTranscript.replace(/\s/g, ''); // Remove all spaces
|
||||||
|
if (newTranscript.endsWith(',')) {
|
||||||
|
newTranscript = newTranscript.slice(0, -1); // Remove the trailing comma
|
||||||
|
} else if (newTranscript.endsWith('.')) {
|
||||||
|
newTranscript = newTranscript.slice(0, -1); // Remove the trailing period
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const lastChar = acc.alternatives[0].transcript.slice(-1);
|
||||||
|
const firstChar = newTranscript.charAt(0);
|
||||||
|
|
||||||
|
if (lastChar.match(/\d/) && firstChar.match(/\d/)) {
|
||||||
|
acc.alternatives[0].transcript += newTranscript;
|
||||||
|
} else {
|
||||||
|
acc.alternatives[0].transcript += ` ${newTranscript}`;
|
||||||
|
}
|
||||||
|
|
||||||
|
return acc;
|
||||||
|
}, {
|
||||||
|
language_code: language,
|
||||||
|
channel_tag: channel,
|
||||||
|
is_final: true,
|
||||||
|
alternatives: [{
|
||||||
|
transcript: ''
|
||||||
|
}]
|
||||||
|
});
|
||||||
|
finalTranscript.alternatives[0].confidence = bufferedTranscripts.length === 1 ?
|
||||||
|
bufferedTranscripts[0].alternatives[0].confidence :
|
||||||
|
totalConfidence / bufferedTranscripts.length;
|
||||||
|
finalTranscript.alternatives[0].transcript = finalTranscript.alternatives[0].transcript.trim();
|
||||||
|
finalTranscript.vendor = {
|
||||||
|
name: 'deepgram',
|
||||||
|
evt: bufferedTranscripts
|
||||||
|
};
|
||||||
|
return finalTranscript;
|
||||||
|
};
|
||||||
|
|
||||||
const compileSonioxTranscripts = (finalWordChunks, channel, language) => {
|
const compileSonioxTranscripts = (finalWordChunks, channel, language) => {
|
||||||
const words = finalWordChunks.flat();
|
const words = finalWordChunks.flat();
|
||||||
const transcript = words.reduce((acc, word) => {
|
const transcript = words.reduce((acc, word) => {
|
||||||
@@ -163,7 +211,7 @@ const normalizeSoniox = (evt, channel, language) => {
|
|||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
const normalizeDeepgram = (evt, channel, language) => {
|
const normalizeDeepgram = (evt, channel, language, shortUtterance) => {
|
||||||
const copy = JSON.parse(JSON.stringify(evt));
|
const copy = JSON.parse(JSON.stringify(evt));
|
||||||
const alternatives = (evt.channel?.alternatives || [])
|
const alternatives = (evt.channel?.alternatives || [])
|
||||||
.map((alt) => ({
|
.map((alt) => ({
|
||||||
@@ -171,10 +219,14 @@ const normalizeDeepgram = (evt, channel, language) => {
|
|||||||
transcript: alt.transcript,
|
transcript: alt.transcript,
|
||||||
}));
|
}));
|
||||||
|
|
||||||
|
/**
|
||||||
|
* note difference between is_final and speech_final in Deepgram:
|
||||||
|
* https://developers.deepgram.com/docs/understand-endpointing-interim-results
|
||||||
|
*/
|
||||||
return {
|
return {
|
||||||
language_code: language,
|
language_code: language,
|
||||||
channel_tag: channel,
|
channel_tag: channel,
|
||||||
is_final: evt.is_final,
|
is_final: shortUtterance ? evt.is_final : evt.speech_final,
|
||||||
alternatives: [alternatives[0]],
|
alternatives: [alternatives[0]],
|
||||||
vendor: {
|
vendor: {
|
||||||
name: 'deepgram',
|
name: 'deepgram',
|
||||||
@@ -325,12 +377,12 @@ const normalizeAws = (evt, channel, language) => {
|
|||||||
|
|
||||||
|
|
||||||
module.exports = (logger) => {
|
module.exports = (logger) => {
|
||||||
const normalizeTranscription = (evt, vendor, channel, language) => {
|
const normalizeTranscription = (evt, vendor, channel, language, shortUtterance) => {
|
||||||
|
|
||||||
//logger.debug({ evt, vendor, channel, language }, 'normalizeTranscription');
|
//logger.debug({ evt, vendor, channel, language }, 'normalizeTranscription');
|
||||||
switch (vendor) {
|
switch (vendor) {
|
||||||
case 'deepgram':
|
case 'deepgram':
|
||||||
return normalizeDeepgram(evt, channel, language);
|
return normalizeDeepgram(evt, channel, language, shortUtterance);
|
||||||
case 'microsoft':
|
case 'microsoft':
|
||||||
return normalizeMicrosoft(evt, channel, language);
|
return normalizeMicrosoft(evt, channel, language);
|
||||||
case 'google':
|
case 'google':
|
||||||
@@ -536,6 +588,8 @@ module.exports = (logger) => {
|
|||||||
{DEEPGRAM_SPEECH_KEYWORDS: deepgramOptions.keywords.join(',')},
|
{DEEPGRAM_SPEECH_KEYWORDS: deepgramOptions.keywords.join(',')},
|
||||||
...('endpointing' in deepgramOptions) &&
|
...('endpointing' in deepgramOptions) &&
|
||||||
{DEEPGRAM_SPEECH_ENDPOINTING: deepgramOptions.endpointing},
|
{DEEPGRAM_SPEECH_ENDPOINTING: deepgramOptions.endpointing},
|
||||||
|
...(deepgramOptions.utteranceEndMs) &&
|
||||||
|
{DEEPGRAM_SPEECH_UTTERANCE_END_MS: deepgramOptions.utteranceEndMs},
|
||||||
...(deepgramOptions.vadTurnoff) &&
|
...(deepgramOptions.vadTurnoff) &&
|
||||||
{DEEPGRAM_SPEECH_VAD_TURNOFF: deepgramOptions.vadTurnoff},
|
{DEEPGRAM_SPEECH_VAD_TURNOFF: deepgramOptions.vadTurnoff},
|
||||||
...(deepgramOptions.tag) &&
|
...(deepgramOptions.tag) &&
|
||||||
@@ -743,6 +797,7 @@ module.exports = (logger) => {
|
|||||||
setChannelVarsForStt,
|
setChannelVarsForStt,
|
||||||
removeSpeechListeners,
|
removeSpeechListeners,
|
||||||
setSpeechCredentialsAtRuntime,
|
setSpeechCredentialsAtRuntime,
|
||||||
compileSonioxTranscripts
|
compileSonioxTranscripts,
|
||||||
|
consolidateTranscripts
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|||||||
Reference in New Issue
Block a user