mirror of
https://github.com/jambonz/jambonz-feature-server.git
synced 2025-12-20 16:50:39 +00:00
Feature/deepgram stt (#190)
* initial changes to support deepgram stt
* fixes for normalizing vendor-specific transcriptions
* update to latest drachtio-fsmrf with support for deepgram stt
* deepgram parsing error
* hints support for deepgram
* handling deepgram errors
* ignore late arriving transcripts for deepgram
* handling of empty transcripts
* transcribe changes
* allow deepgram stt credentials to be provided at run time
* bind channel in transcription handler
* fixes for transcribe when handling empty transcripts
* more empty transcript fixes
* update tests to latest modules
* add test cases for deepgram speech recognition
This commit is contained in:
@@ -5,7 +5,8 @@ const {
|
||||
GoogleTranscriptionEvents,
|
||||
NuanceTranscriptionEvents,
|
||||
AwsTranscriptionEvents,
|
||||
AzureTranscriptionEvents
|
||||
AzureTranscriptionEvents,
|
||||
DeepgramTranscriptionEvents
|
||||
} = require('../utils/constants');
|
||||
|
||||
const makeTask = require('./make_task');
|
||||
@@ -54,11 +55,14 @@ class TaskGather extends Task {
|
||||
this.vendor = recognizer.vendor;
|
||||
this.language = recognizer.language;
|
||||
|
||||
/* let credentials be supplied in the recognizer object at runtime */
|
||||
if (recognizer.vendor === 'nuance') {
|
||||
const {clientId, secret} = recognizer.nuanceOptions;
|
||||
if (clientId && secret) {
|
||||
this.sttCredentials = {client_id: clientId, secret};
|
||||
}
|
||||
if (clientId && secret) this.sttCredentials = {client_id: clientId, secret};
|
||||
}
|
||||
else if (recognizer.vendor === 'deepgram') {
|
||||
const {apiKey} = recognizer.deepgramOptions;
|
||||
if (apiKey) this.sttCredentials = {api_key: apiKey};
|
||||
}
|
||||
|
||||
/* continuous ASR (i.e. compile transcripts until a special timeout or dtmf key) */
|
||||
@@ -338,8 +342,16 @@ class TaskGather extends Task {
|
||||
if ((this.sayTask || this.playTask) && this.listenDuringPrompt) {
|
||||
opts.NUANCE_STALL_TIMERS = 1;
|
||||
}
|
||||
|
||||
break;
|
||||
|
||||
case 'deepgram':
|
||||
this.bugname = 'deepgram_transcribe';
|
||||
ep.addCustomEventListener(DeepgramTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
|
||||
ep.addCustomEventListener(DeepgramTranscriptionEvents.Connect, this._onDeepgramConnect.bind(this, cs, ep));
|
||||
ep.addCustomEventListener(DeepgramTranscriptionEvents.ConnectFailure,
|
||||
this._onDeepGramConnectFailure.bind(this, cs, ep));
|
||||
break;
|
||||
|
||||
default:
|
||||
throw new Error(`Invalid vendor ${this.vendor}`);
|
||||
}
|
||||
@@ -441,30 +453,31 @@ class TaskGather extends Task {
|
||||
|
||||
_onTranscription(cs, ep, evt, fsEvent) {
|
||||
// make sure this is not a transcript from answering machine detection
|
||||
this.logger.debug({evt}, 'Gather:_onTranscription');
|
||||
const bugname = fsEvent.getHeader('media-bugname');
|
||||
const finished = fsEvent.getHeader('transcription-session-finished');
|
||||
this.logger.debug({evt, bugname, finished}, 'Gather:_onTranscription');
|
||||
if (bugname && this.bugname !== bugname) return;
|
||||
|
||||
evt = this.normalizeTranscription(evt, this.vendor, 1, this.language);
|
||||
|
||||
/* count words for bargein feature */
|
||||
const words = evt.alternatives[0].transcript.split(' ').length;
|
||||
const words = evt.alternatives[0]?.transcript.split(' ').length;
|
||||
const bufferedWords = this._bufferedTranscripts.reduce((count, e) => {
|
||||
return count + e.alternatives[0].transcript.split(' ').length;
|
||||
return count + e.alternatives[0]?.transcript.split(' ').length;
|
||||
}, 0);
|
||||
|
||||
if (evt.is_final) {
|
||||
if (evt.alternatives[0].transcript === '' && !this.callSession.callGone && !this.killed) {
|
||||
if ('microsoft' === this.vendor && finished === 'true') {
|
||||
if (finished === 'true' && ['microsoft', 'deepgram'].includes(this.vendor)) {
|
||||
this.logger.debug({evt}, 'TaskGather:_onTranscription - got empty transcript from old gather, disregarding');
|
||||
}
|
||||
else {
|
||||
this.logger.info({evt}, 'TaskGather:_onTranscription - got empty transcript, listen again');
|
||||
this._startTranscribing(ep);
|
||||
this.logger.info({evt}, 'TaskGather:_onTranscription - got empty transcript, continue listening');
|
||||
//this._startTranscribing(ep);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (this.isContinuousAsr) {
|
||||
/* append the transcript and start listening again for asrTimeout */
|
||||
const t = evt.alternatives[0].transcript;
|
||||
@@ -548,6 +561,23 @@ class TaskGather extends Task {
|
||||
return this._resolve('timeout');
|
||||
}
|
||||
}
|
||||
_onDeepgramConnect(_cs, _ep) {
|
||||
this.logger.debug('TaskGather:_onDeepgramConnect');
|
||||
}
|
||||
|
||||
_onDeepGramConnectFailure(cs, _ep, evt) {
|
||||
const {reason} = evt;
|
||||
const {writeAlerts, AlertType} = cs.srf.locals;
|
||||
this.logger.info({evt}, 'TaskGather:_onDeepgramConnectFailure');
|
||||
writeAlerts({
|
||||
account_sid: cs.accountSid,
|
||||
alert_type: AlertType.STT_FAILURE,
|
||||
message: `Failed connecting to Deepgram speech recognizer: ${reason}`,
|
||||
vendor: 'deepgram',
|
||||
}).catch((err) => this.logger.info({err}, 'Error generating alert for deepgram connection failure'));
|
||||
this.notifyError(`Failed connecting to speech vendor deepgram: ${reason}`);
|
||||
this.notifyTaskDone();
|
||||
}
|
||||
|
||||
_onVadDetected(cs, ep) {
|
||||
if (this.bargein && this.minBargeinWordCount === 0) {
|
||||
|
||||
@@ -445,7 +445,7 @@
|
||||
"properties": {
|
||||
"vendor": {
|
||||
"type": "string",
|
||||
"enum": ["google", "aws", "microsoft", "nuance", "default"]
|
||||
"enum": ["google", "aws", "microsoft", "nuance", "deepgram", "default"]
|
||||
},
|
||||
"language": "string",
|
||||
"vad": "#vad",
|
||||
@@ -510,12 +510,63 @@
|
||||
"azureSttEndpointId": "string",
|
||||
"asrDtmfTerminationDigit": "string",
|
||||
"asrTimeout": "number",
|
||||
"nuanceOptions": "#nuanceOptions"
|
||||
"nuanceOptions": "#nuanceOptions",
|
||||
"deepgramOptions": "#deepgramOptions"
|
||||
},
|
||||
"required": [
|
||||
"vendor"
|
||||
]
|
||||
},
|
||||
"deepgramOptions": {
|
||||
"properties": {
|
||||
"apiKey": "string",
|
||||
"tier": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"enhanced",
|
||||
"base"
|
||||
]
|
||||
},
|
||||
"model": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"general",
|
||||
"meeting",
|
||||
"phonecall",
|
||||
"voicemail",
|
||||
"finance",
|
||||
"conversationalai",
|
||||
"video",
|
||||
"custom"
|
||||
]
|
||||
},
|
||||
"customModel": "string",
|
||||
"version": "string",
|
||||
"punctuate": "boolean",
|
||||
"profanityFilter": "boolean",
|
||||
"redact": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"pci",
|
||||
"numbers",
|
||||
"true",
|
||||
"ssn"
|
||||
]
|
||||
},
|
||||
"diarize": "boolean",
|
||||
"diarizeVersion": "string",
|
||||
"ner": "boolean",
|
||||
"multichannel": "boolean",
|
||||
"alternatives": "number",
|
||||
"numerals": "boolean",
|
||||
"search": "array",
|
||||
"replace": "array",
|
||||
"keywords": "array",
|
||||
"endpointing": "boolean",
|
||||
"vadTurnoff": "number",
|
||||
"tag": "string"
|
||||
}
|
||||
},
|
||||
"nuanceOptions": {
|
||||
"properties": {
|
||||
"clientId": "string",
|
||||
|
||||
@@ -5,7 +5,8 @@ const {
|
||||
GoogleTranscriptionEvents,
|
||||
AzureTranscriptionEvents,
|
||||
AwsTranscriptionEvents,
|
||||
NuanceTranscriptionEvents
|
||||
NuanceTranscriptionEvents,
|
||||
DeepgramTranscriptionEvents
|
||||
} = require('../utils/constants');
|
||||
const normalizeJambones = require('../utils/normalize-jambones');
|
||||
|
||||
@@ -15,9 +16,14 @@ class TaskTranscribe extends Task {
|
||||
this.preconditions = TaskPreconditions.Endpoint;
|
||||
this.parentTask = parentTask;
|
||||
|
||||
const {setChannelVarsForStt, normalizeTranscription} = require('../utils/transcription-utils')(logger);
|
||||
const {
|
||||
setChannelVarsForStt,
|
||||
normalizeTranscription,
|
||||
removeSpeechListeners
|
||||
} = require('../utils/transcription-utils')(logger);
|
||||
this.setChannelVarsForStt = setChannelVarsForStt;
|
||||
this.normalizeTranscription = normalizeTranscription;
|
||||
this.removeSpeechListeners = removeSpeechListeners;
|
||||
|
||||
this.transcriptionHook = this.data.transcriptionHook;
|
||||
this.earlyMedia = this.data.earlyMedia === true || (parentTask && parentTask.earlyMedia);
|
||||
@@ -28,12 +34,17 @@ class TaskTranscribe extends Task {
|
||||
this.interim = !!recognizer.interim;
|
||||
this.separateRecognitionPerChannel = recognizer.separateRecognitionPerChannel;
|
||||
|
||||
/* let credentials be supplied in the recognizer object at runtime */
|
||||
if (recognizer.vendor === 'nuance') {
|
||||
const {clientId, secret} = recognizer.nuanceOptions;
|
||||
if (clientId && secret) {
|
||||
this.sttCredentials = {client_id: clientId, secret};
|
||||
}
|
||||
}
|
||||
else if (recognizer.vendor === 'deepgram') {
|
||||
const {apiKey} = recognizer.deepgramOptions;
|
||||
if (apiKey) this.sttCredentials = {api_key: apiKey};
|
||||
}
|
||||
|
||||
recognizer.hints = recognizer.hints || [];
|
||||
recognizer.altLanguages = recognizer.altLanguages || [];
|
||||
@@ -69,7 +80,7 @@ class TaskTranscribe extends Task {
|
||||
if (!this.data.recognizer.vendor) {
|
||||
this.data.recognizer.vendor = this.vendor;
|
||||
}
|
||||
this.sttCredentials = cs.getSpeechCredentials(this.vendor, 'stt');
|
||||
if (!this.sttCredentials) this.sttCredentials = cs.getSpeechCredentials(this.vendor, 'stt');
|
||||
|
||||
try {
|
||||
if (!this.sttCredentials) {
|
||||
@@ -105,22 +116,7 @@ class TaskTranscribe extends Task {
|
||||
this.logger.info(err, 'TaskTranscribe:exec - error');
|
||||
this.parentTask && this.parentTask.emit('error', err);
|
||||
}
|
||||
ep.removeCustomEventListener(GoogleTranscriptionEvents.Transcription);
|
||||
ep.removeCustomEventListener(GoogleTranscriptionEvents.EndOfUtterance);
|
||||
ep.removeCustomEventListener(GoogleTranscriptionEvents.VadDetected);
|
||||
|
||||
ep.removeCustomEventListener(AwsTranscriptionEvents.Transcription);
|
||||
ep.removeCustomEventListener(AwsTranscriptionEvents.VadDetected);
|
||||
|
||||
ep.removeCustomEventListener(AzureTranscriptionEvents.Transcription);
|
||||
ep.removeCustomEventListener(AzureTranscriptionEvents.NoSpeechDetected);
|
||||
ep.removeCustomEventListener(AzureTranscriptionEvents.VadDetected);
|
||||
|
||||
ep.removeCustomEventListener(NuanceTranscriptionEvents.Transcription);
|
||||
ep.removeCustomEventListener(NuanceTranscriptionEvents.TranscriptionComplete);
|
||||
ep.removeCustomEventListener(NuanceTranscriptionEvents.StartOfSpeech);
|
||||
ep.removeCustomEventListener(NuanceTranscriptionEvents.Error);
|
||||
ep.removeCustomEventListener(NuanceTranscriptionEvents.VadDetected);
|
||||
this.removeSpeechListeners(ep);
|
||||
}
|
||||
|
||||
async kill(cs) {
|
||||
@@ -184,6 +180,15 @@ class TaskTranscribe extends Task {
|
||||
ep.addCustomEventListener(AzureTranscriptionEvents.Error,
|
||||
this._onNuanceError.bind(this, cs, ep, channel));
|
||||
break;
|
||||
case 'deepgram':
|
||||
this.bugname = 'deepgram_transcribe';
|
||||
ep.addCustomEventListener(DeepgramTranscriptionEvents.Transcription,
|
||||
this._onTranscription.bind(this, cs, ep, channel));
|
||||
ep.addCustomEventListener(DeepgramTranscriptionEvents.Connect,
|
||||
this._onDeepgramConnect.bind(this, cs, ep, channel));
|
||||
ep.addCustomEventListener(DeepgramTranscriptionEvents.ConnectFailure,
|
||||
this._onDeepGramConnectFailure.bind(this, cs, ep, channel));
|
||||
break;
|
||||
default:
|
||||
throw new Error(`Invalid vendor ${this.vendor}`);
|
||||
}
|
||||
@@ -215,9 +220,15 @@ class TaskTranscribe extends Task {
|
||||
|
||||
this.logger.debug({evt}, 'TaskTranscribe:_onTranscription');
|
||||
|
||||
if (evt.alternatives[0].transcript === '' && !cs.callGone && !this.killed) {
|
||||
this.logger.info({evt}, 'TaskGather:_onTranscription - got empty transcript, listen again');
|
||||
return this._transcribe(ep);
|
||||
if (evt.alternatives[0]?.transcript === '' && !cs.callGone && !this.killed) {
|
||||
if (['microsoft', 'deepgram'].includes(this.vendor)) {
|
||||
this.logger.info({evt}, 'TaskTranscribe:_onTranscription - got empty transcript, continue listening');
|
||||
}
|
||||
else {
|
||||
this.logger.info({evt}, 'TaskTranscribe:_onTranscription - got empty transcript, listen again');
|
||||
this._transcribe(ep);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (this.transcriptionHook) {
|
||||
@@ -268,6 +279,34 @@ class TaskTranscribe extends Task {
|
||||
this._timer = null;
|
||||
}
|
||||
}
|
||||
_onNuanceError(_cs, _ep, evt) {
|
||||
const {code, error, details} = evt;
|
||||
if (code === 404 && error === 'No speech') {
|
||||
this.logger.debug({code, error, details}, 'TaskTranscribe:_onNuanceError');
|
||||
return this._resolve('timeout');
|
||||
}
|
||||
this.logger.info({code, error, details}, 'TaskTranscribe:_onNuanceError');
|
||||
if (code === 413 && error === 'Too much speech') {
|
||||
return this._resolve('timeout');
|
||||
}
|
||||
}
|
||||
_onDeepgramConnect(_cs, _ep) {
|
||||
this.logger.debug('TaskTranscribe:_onDeepgramConnect');
|
||||
}
|
||||
|
||||
_onDeepGramConnectFailure(cs, _ep, evt) {
|
||||
const {reason} = evt;
|
||||
const {writeAlerts, AlertType} = cs.srf.locals;
|
||||
this.logger.info({evt}, 'TaskTranscribe:_onDeepgramConnectFailure');
|
||||
writeAlerts({
|
||||
account_sid: cs.accountSid,
|
||||
alert_type: AlertType.STT_FAILURE,
|
||||
message: `Failed connecting to Deepgram speech recognizer: ${reason}`,
|
||||
vendor: 'deepgram',
|
||||
}).catch((err) => this.logger.info({err}, 'Error generating alert for deepgram connection failure'));
|
||||
this.notifyError(`Failed connecting to speech vendor deepgram: ${reason}`);
|
||||
this.notifyTaskDone();
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = TaskTranscribe;
|
||||
|
||||
Reference in New Issue
Block a user