Feature/deepgram stt (#190)

* initial changes to support deepgram stt

* fixes for normalizing vendor-specific transcriptions

* update to latest drachtio-fsmrf with support for deepgram stt

* deepgram parsing error

* hints support for deepgram

* handling deepgram errors

* ignore late arriving transcripts for deepgram

* handling of empty transcripts

* transcribe changes

* allow deepgram stt credentials to be provided at run time

* bind channel in transcription handler

* fixes for transcribe when handling empty transcripts

* more empty transcript fixes

* update tests to latest modules

* add test cases for deepgram speech recognition
This commit is contained in:
Dave Horton
2022-11-12 19:48:59 -05:00
committed by GitHub
parent f511e6ab6b
commit 8686348454
12 changed files with 2148 additions and 152 deletions

View File

@@ -5,7 +5,8 @@ const {
GoogleTranscriptionEvents,
NuanceTranscriptionEvents,
AwsTranscriptionEvents,
AzureTranscriptionEvents
AzureTranscriptionEvents,
DeepgramTranscriptionEvents
} = require('../utils/constants');
const makeTask = require('./make_task');
@@ -54,11 +55,14 @@ class TaskGather extends Task {
this.vendor = recognizer.vendor;
this.language = recognizer.language;
/* let credentials be supplied in the recognizer object at runtime */
if (recognizer.vendor === 'nuance') {
const {clientId, secret} = recognizer.nuanceOptions;
if (clientId && secret) {
this.sttCredentials = {client_id: clientId, secret};
}
if (clientId && secret) this.sttCredentials = {client_id: clientId, secret};
}
else if (recognizer.vendor === 'deepgram') {
const {apiKey} = recognizer.deepgramOptions;
if (apiKey) this.sttCredentials = {api_key: apiKey};
}
/* continuous ASR (i.e. compile transcripts until a special timeout or dtmf key) */
@@ -338,8 +342,16 @@ class TaskGather extends Task {
if ((this.sayTask || this.playTask) && this.listenDuringPrompt) {
opts.NUANCE_STALL_TIMERS = 1;
}
break;
case 'deepgram':
this.bugname = 'deepgram_transcribe';
ep.addCustomEventListener(DeepgramTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
ep.addCustomEventListener(DeepgramTranscriptionEvents.Connect, this._onDeepgramConnect.bind(this, cs, ep));
ep.addCustomEventListener(DeepgramTranscriptionEvents.ConnectFailure,
this._onDeepGramConnectFailure.bind(this, cs, ep));
break;
default:
throw new Error(`Invalid vendor ${this.vendor}`);
}
@@ -441,30 +453,31 @@ class TaskGather extends Task {
_onTranscription(cs, ep, evt, fsEvent) {
// make sure this is not a transcript from answering machine detection
this.logger.debug({evt}, 'Gather:_onTranscription');
const bugname = fsEvent.getHeader('media-bugname');
const finished = fsEvent.getHeader('transcription-session-finished');
this.logger.debug({evt, bugname, finished}, 'Gather:_onTranscription');
if (bugname && this.bugname !== bugname) return;
evt = this.normalizeTranscription(evt, this.vendor, 1, this.language);
/* count words for bargein feature */
const words = evt.alternatives[0].transcript.split(' ').length;
const words = evt.alternatives[0]?.transcript.split(' ').length;
const bufferedWords = this._bufferedTranscripts.reduce((count, e) => {
return count + e.alternatives[0].transcript.split(' ').length;
return count + e.alternatives[0]?.transcript.split(' ').length;
}, 0);
if (evt.is_final) {
if (evt.alternatives[0].transcript === '' && !this.callSession.callGone && !this.killed) {
if ('microsoft' === this.vendor && finished === 'true') {
if (finished === 'true' && ['microsoft', 'deepgram'].includes(this.vendor)) {
this.logger.debug({evt}, 'TaskGather:_onTranscription - got empty transcript from old gather, disregarding');
}
else {
this.logger.info({evt}, 'TaskGather:_onTranscription - got empty transcript, listen again');
this._startTranscribing(ep);
this.logger.info({evt}, 'TaskGather:_onTranscription - got empty transcript, continue listening');
//this._startTranscribing(ep);
}
return;
}
if (this.isContinuousAsr) {
/* append the transcript and start listening again for asrTimeout */
const t = evt.alternatives[0].transcript;
@@ -548,6 +561,23 @@ class TaskGather extends Task {
return this._resolve('timeout');
}
}
_onDeepgramConnect(_cs, _ep) {
this.logger.debug('TaskGather:_onDeepgramConnect');
}
_onDeepGramConnectFailure(cs, _ep, evt) {
const {reason} = evt;
const {writeAlerts, AlertType} = cs.srf.locals;
this.logger.info({evt}, 'TaskGather:_onDeepgramConnectFailure');
writeAlerts({
account_sid: cs.accountSid,
alert_type: AlertType.STT_FAILURE,
message: `Failed connecting to Deepgram speech recognizer: ${reason}`,
vendor: 'deepgram',
}).catch((err) => this.logger.info({err}, 'Error generating alert for deepgram connection failure'));
this.notifyError(`Failed connecting to speech vendor deepgram: ${reason}`);
this.notifyTaskDone();
}
_onVadDetected(cs, ep) {
if (this.bargein && this.minBargeinWordCount === 0) {

View File

@@ -445,7 +445,7 @@
"properties": {
"vendor": {
"type": "string",
"enum": ["google", "aws", "microsoft", "nuance", "default"]
"enum": ["google", "aws", "microsoft", "nuance", "deepgram", "default"]
},
"language": "string",
"vad": "#vad",
@@ -510,12 +510,63 @@
"azureSttEndpointId": "string",
"asrDtmfTerminationDigit": "string",
"asrTimeout": "number",
"nuanceOptions": "#nuanceOptions"
"nuanceOptions": "#nuanceOptions",
"deepgramOptions": "#deepgramOptions"
},
"required": [
"vendor"
]
},
"deepgramOptions": {
"properties": {
"apiKey": "string",
"tier": {
"type": "string",
"enum": [
"enhanced",
"base"
]
},
"model": {
"type": "string",
"enum": [
"general",
"meeting",
"phonecall",
"voicemail",
"finance",
"conversationalai",
"video",
"custom"
]
},
"customModel": "string",
"version": "string",
"punctuate": "boolean",
"profanityFilter": "boolean",
"redact": {
"type": "string",
"enum": [
"pci",
"numbers",
"true",
"ssn"
]
},
"diarize": "boolean",
"diarizeVersion": "string",
"ner": "boolean",
"multichannel": "boolean",
"alternatives": "number",
"numerals": "boolean",
"search": "array",
"replace": "array",
"keywords": "array",
"endpointing": "boolean",
"vadTurnoff": "number",
"tag": "string"
}
},
"nuanceOptions": {
"properties": {
"clientId": "string",

View File

@@ -5,7 +5,8 @@ const {
GoogleTranscriptionEvents,
AzureTranscriptionEvents,
AwsTranscriptionEvents,
NuanceTranscriptionEvents
NuanceTranscriptionEvents,
DeepgramTranscriptionEvents
} = require('../utils/constants');
const normalizeJambones = require('../utils/normalize-jambones');
@@ -15,9 +16,14 @@ class TaskTranscribe extends Task {
this.preconditions = TaskPreconditions.Endpoint;
this.parentTask = parentTask;
const {setChannelVarsForStt, normalizeTranscription} = require('../utils/transcription-utils')(logger);
const {
setChannelVarsForStt,
normalizeTranscription,
removeSpeechListeners
} = require('../utils/transcription-utils')(logger);
this.setChannelVarsForStt = setChannelVarsForStt;
this.normalizeTranscription = normalizeTranscription;
this.removeSpeechListeners = removeSpeechListeners;
this.transcriptionHook = this.data.transcriptionHook;
this.earlyMedia = this.data.earlyMedia === true || (parentTask && parentTask.earlyMedia);
@@ -28,12 +34,17 @@ class TaskTranscribe extends Task {
this.interim = !!recognizer.interim;
this.separateRecognitionPerChannel = recognizer.separateRecognitionPerChannel;
/* let credentials be supplied in the recognizer object at runtime */
if (recognizer.vendor === 'nuance') {
const {clientId, secret} = recognizer.nuanceOptions;
if (clientId && secret) {
this.sttCredentials = {client_id: clientId, secret};
}
}
else if (recognizer.vendor === 'deepgram') {
const {apiKey} = recognizer.deepgramOptions;
if (apiKey) this.sttCredentials = {api_key: apiKey};
}
recognizer.hints = recognizer.hints || [];
recognizer.altLanguages = recognizer.altLanguages || [];
@@ -69,7 +80,7 @@ class TaskTranscribe extends Task {
if (!this.data.recognizer.vendor) {
this.data.recognizer.vendor = this.vendor;
}
this.sttCredentials = cs.getSpeechCredentials(this.vendor, 'stt');
if (!this.sttCredentials) this.sttCredentials = cs.getSpeechCredentials(this.vendor, 'stt');
try {
if (!this.sttCredentials) {
@@ -105,22 +116,7 @@ class TaskTranscribe extends Task {
this.logger.info(err, 'TaskTranscribe:exec - error');
this.parentTask && this.parentTask.emit('error', err);
}
ep.removeCustomEventListener(GoogleTranscriptionEvents.Transcription);
ep.removeCustomEventListener(GoogleTranscriptionEvents.EndOfUtterance);
ep.removeCustomEventListener(GoogleTranscriptionEvents.VadDetected);
ep.removeCustomEventListener(AwsTranscriptionEvents.Transcription);
ep.removeCustomEventListener(AwsTranscriptionEvents.VadDetected);
ep.removeCustomEventListener(AzureTranscriptionEvents.Transcription);
ep.removeCustomEventListener(AzureTranscriptionEvents.NoSpeechDetected);
ep.removeCustomEventListener(AzureTranscriptionEvents.VadDetected);
ep.removeCustomEventListener(NuanceTranscriptionEvents.Transcription);
ep.removeCustomEventListener(NuanceTranscriptionEvents.TranscriptionComplete);
ep.removeCustomEventListener(NuanceTranscriptionEvents.StartOfSpeech);
ep.removeCustomEventListener(NuanceTranscriptionEvents.Error);
ep.removeCustomEventListener(NuanceTranscriptionEvents.VadDetected);
this.removeSpeechListeners(ep);
}
async kill(cs) {
@@ -184,6 +180,15 @@ class TaskTranscribe extends Task {
ep.addCustomEventListener(AzureTranscriptionEvents.Error,
this._onNuanceError.bind(this, cs, ep, channel));
break;
case 'deepgram':
this.bugname = 'deepgram_transcribe';
ep.addCustomEventListener(DeepgramTranscriptionEvents.Transcription,
this._onTranscription.bind(this, cs, ep, channel));
ep.addCustomEventListener(DeepgramTranscriptionEvents.Connect,
this._onDeepgramConnect.bind(this, cs, ep, channel));
ep.addCustomEventListener(DeepgramTranscriptionEvents.ConnectFailure,
this._onDeepGramConnectFailure.bind(this, cs, ep, channel));
break;
default:
throw new Error(`Invalid vendor ${this.vendor}`);
}
@@ -215,9 +220,15 @@ class TaskTranscribe extends Task {
this.logger.debug({evt}, 'TaskTranscribe:_onTranscription');
if (evt.alternatives[0].transcript === '' && !cs.callGone && !this.killed) {
this.logger.info({evt}, 'TaskGather:_onTranscription - got empty transcript, listen again');
return this._transcribe(ep);
if (evt.alternatives[0]?.transcript === '' && !cs.callGone && !this.killed) {
if (['microsoft', 'deepgram'].includes(this.vendor)) {
this.logger.info({evt}, 'TaskTranscribe:_onTranscription - got empty transcript, continue listening');
}
else {
this.logger.info({evt}, 'TaskTranscribe:_onTranscription - got empty transcript, listen again');
this._transcribe(ep);
}
return;
}
if (this.transcriptionHook) {
@@ -268,6 +279,34 @@ class TaskTranscribe extends Task {
this._timer = null;
}
}
_onNuanceError(_cs, _ep, evt) {
const {code, error, details} = evt;
if (code === 404 && error === 'No speech') {
this.logger.debug({code, error, details}, 'TaskTranscribe:_onNuanceError');
return this._resolve('timeout');
}
this.logger.info({code, error, details}, 'TaskTranscribe:_onNuanceError');
if (code === 413 && error === 'Too much speech') {
return this._resolve('timeout');
}
}
_onDeepgramConnect(_cs, _ep) {
this.logger.debug('TaskTranscribe:_onDeepgramConnect');
}
_onDeepGramConnectFailure(cs, _ep, evt) {
const {reason} = evt;
const {writeAlerts, AlertType} = cs.srf.locals;
this.logger.info({evt}, 'TaskTranscribe:_onDeepgramConnectFailure');
writeAlerts({
account_sid: cs.accountSid,
alert_type: AlertType.STT_FAILURE,
message: `Failed connecting to Deepgram speech recognizer: ${reason}`,
vendor: 'deepgram',
}).catch((err) => this.logger.info({err}, 'Error generating alert for deepgram connection failure'));
this.notifyError(`Failed connecting to speech vendor deepgram: ${reason}`);
this.notifyTaskDone();
}
}
module.exports = TaskTranscribe;