mirror of
https://github.com/jambonz/jambonz-feature-server.git
synced 2025-12-20 16:50:39 +00:00
Feature/deepgram stt (#190)
* initial changes to support deepgram stt * fixes for normalizing vendor-specific transcriptions * update to latest drachtio-fsmrf with support for deepgram stt * deepgram parsing error * hints support for deepgram * handling deepgram errors * ignore late arriving transcripts for deepgram * handling of empty transcripts * transcribe changes * allow deepgram stt credentials to be provided at run time * bind channel in transcription handler * fixes for transcribe when handling empty transcripts * more empty transcript fixes * update tests to latest modules * add test cases for deepgram speech recognition
This commit is contained in:
@@ -5,7 +5,8 @@ const {
|
||||
GoogleTranscriptionEvents,
|
||||
AzureTranscriptionEvents,
|
||||
AwsTranscriptionEvents,
|
||||
NuanceTranscriptionEvents
|
||||
NuanceTranscriptionEvents,
|
||||
DeepgramTranscriptionEvents
|
||||
} = require('../utils/constants');
|
||||
const normalizeJambones = require('../utils/normalize-jambones');
|
||||
|
||||
@@ -15,9 +16,14 @@ class TaskTranscribe extends Task {
|
||||
this.preconditions = TaskPreconditions.Endpoint;
|
||||
this.parentTask = parentTask;
|
||||
|
||||
const {setChannelVarsForStt, normalizeTranscription} = require('../utils/transcription-utils')(logger);
|
||||
const {
|
||||
setChannelVarsForStt,
|
||||
normalizeTranscription,
|
||||
removeSpeechListeners
|
||||
} = require('../utils/transcription-utils')(logger);
|
||||
this.setChannelVarsForStt = setChannelVarsForStt;
|
||||
this.normalizeTranscription = normalizeTranscription;
|
||||
this.removeSpeechListeners = removeSpeechListeners;
|
||||
|
||||
this.transcriptionHook = this.data.transcriptionHook;
|
||||
this.earlyMedia = this.data.earlyMedia === true || (parentTask && parentTask.earlyMedia);
|
||||
@@ -28,12 +34,17 @@ class TaskTranscribe extends Task {
|
||||
this.interim = !!recognizer.interim;
|
||||
this.separateRecognitionPerChannel = recognizer.separateRecognitionPerChannel;
|
||||
|
||||
/* let credentials be supplied in the recognizer object at runtime */
|
||||
if (recognizer.vendor === 'nuance') {
|
||||
const {clientId, secret} = recognizer.nuanceOptions;
|
||||
if (clientId && secret) {
|
||||
this.sttCredentials = {client_id: clientId, secret};
|
||||
}
|
||||
}
|
||||
else if (recognizer.vendor === 'deepgram') {
|
||||
const {apiKey} = recognizer.deepgramOptions;
|
||||
if (apiKey) this.sttCredentials = {api_key: apiKey};
|
||||
}
|
||||
|
||||
recognizer.hints = recognizer.hints || [];
|
||||
recognizer.altLanguages = recognizer.altLanguages || [];
|
||||
@@ -69,7 +80,7 @@ class TaskTranscribe extends Task {
|
||||
if (!this.data.recognizer.vendor) {
|
||||
this.data.recognizer.vendor = this.vendor;
|
||||
}
|
||||
this.sttCredentials = cs.getSpeechCredentials(this.vendor, 'stt');
|
||||
if (!this.sttCredentials) this.sttCredentials = cs.getSpeechCredentials(this.vendor, 'stt');
|
||||
|
||||
try {
|
||||
if (!this.sttCredentials) {
|
||||
@@ -105,22 +116,7 @@ class TaskTranscribe extends Task {
|
||||
this.logger.info(err, 'TaskTranscribe:exec - error');
|
||||
this.parentTask && this.parentTask.emit('error', err);
|
||||
}
|
||||
ep.removeCustomEventListener(GoogleTranscriptionEvents.Transcription);
|
||||
ep.removeCustomEventListener(GoogleTranscriptionEvents.EndOfUtterance);
|
||||
ep.removeCustomEventListener(GoogleTranscriptionEvents.VadDetected);
|
||||
|
||||
ep.removeCustomEventListener(AwsTranscriptionEvents.Transcription);
|
||||
ep.removeCustomEventListener(AwsTranscriptionEvents.VadDetected);
|
||||
|
||||
ep.removeCustomEventListener(AzureTranscriptionEvents.Transcription);
|
||||
ep.removeCustomEventListener(AzureTranscriptionEvents.NoSpeechDetected);
|
||||
ep.removeCustomEventListener(AzureTranscriptionEvents.VadDetected);
|
||||
|
||||
ep.removeCustomEventListener(NuanceTranscriptionEvents.Transcription);
|
||||
ep.removeCustomEventListener(NuanceTranscriptionEvents.TranscriptionComplete);
|
||||
ep.removeCustomEventListener(NuanceTranscriptionEvents.StartOfSpeech);
|
||||
ep.removeCustomEventListener(NuanceTranscriptionEvents.Error);
|
||||
ep.removeCustomEventListener(NuanceTranscriptionEvents.VadDetected);
|
||||
this.removeSpeechListeners(ep);
|
||||
}
|
||||
|
||||
async kill(cs) {
|
||||
@@ -184,6 +180,15 @@ class TaskTranscribe extends Task {
|
||||
ep.addCustomEventListener(AzureTranscriptionEvents.Error,
|
||||
this._onNuanceError.bind(this, cs, ep, channel));
|
||||
break;
|
||||
case 'deepgram':
|
||||
this.bugname = 'deepgram_transcribe';
|
||||
ep.addCustomEventListener(DeepgramTranscriptionEvents.Transcription,
|
||||
this._onTranscription.bind(this, cs, ep, channel));
|
||||
ep.addCustomEventListener(DeepgramTranscriptionEvents.Connect,
|
||||
this._onDeepgramConnect.bind(this, cs, ep, channel));
|
||||
ep.addCustomEventListener(DeepgramTranscriptionEvents.ConnectFailure,
|
||||
this._onDeepGramConnectFailure.bind(this, cs, ep, channel));
|
||||
break;
|
||||
default:
|
||||
throw new Error(`Invalid vendor ${this.vendor}`);
|
||||
}
|
||||
@@ -215,9 +220,15 @@ class TaskTranscribe extends Task {
|
||||
|
||||
this.logger.debug({evt}, 'TaskTranscribe:_onTranscription');
|
||||
|
||||
if (evt.alternatives[0].transcript === '' && !cs.callGone && !this.killed) {
|
||||
this.logger.info({evt}, 'TaskGather:_onTranscription - got empty transcript, listen again');
|
||||
return this._transcribe(ep);
|
||||
if (evt.alternatives[0]?.transcript === '' && !cs.callGone && !this.killed) {
|
||||
if (['microsoft', 'deepgram'].includes(this.vendor)) {
|
||||
this.logger.info({evt}, 'TaskTranscribe:_onTranscription - got empty transcript, continue listening');
|
||||
}
|
||||
else {
|
||||
this.logger.info({evt}, 'TaskTranscribe:_onTranscription - got empty transcript, listen again');
|
||||
this._transcribe(ep);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (this.transcriptionHook) {
|
||||
@@ -268,6 +279,34 @@ class TaskTranscribe extends Task {
|
||||
this._timer = null;
|
||||
}
|
||||
}
|
||||
_onNuanceError(_cs, _ep, evt) {
|
||||
const {code, error, details} = evt;
|
||||
if (code === 404 && error === 'No speech') {
|
||||
this.logger.debug({code, error, details}, 'TaskTranscribe:_onNuanceError');
|
||||
return this._resolve('timeout');
|
||||
}
|
||||
this.logger.info({code, error, details}, 'TaskTranscribe:_onNuanceError');
|
||||
if (code === 413 && error === 'Too much speech') {
|
||||
return this._resolve('timeout');
|
||||
}
|
||||
}
|
||||
_onDeepgramConnect(_cs, _ep) {
|
||||
this.logger.debug('TaskTranscribe:_onDeepgramConnect');
|
||||
}
|
||||
|
||||
_onDeepGramConnectFailure(cs, _ep, evt) {
|
||||
const {reason} = evt;
|
||||
const {writeAlerts, AlertType} = cs.srf.locals;
|
||||
this.logger.info({evt}, 'TaskTranscribe:_onDeepgramConnectFailure');
|
||||
writeAlerts({
|
||||
account_sid: cs.accountSid,
|
||||
alert_type: AlertType.STT_FAILURE,
|
||||
message: `Failed connecting to Deepgram speech recognizer: ${reason}`,
|
||||
vendor: 'deepgram',
|
||||
}).catch((err) => this.logger.info({err}, 'Error generating alert for deepgram connection failure'));
|
||||
this.notifyError(`Failed connecting to speech vendor deepgram: ${reason}`);
|
||||
this.notifyTaskDone();
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = TaskTranscribe;
|
||||
|
||||
Reference in New Issue
Block a user