Feature/azure recognition (#46)

* add support for microsoft speech recognition

* update to drachtio-fsmrf that support microsoft stt

* gather and transcribe now support microsoft
This commit is contained in:
Dave Horton
2021-11-26 16:40:25 -06:00
committed by GitHub
parent fe1778e9ae
commit 1e93973419
6 changed files with 180 additions and 66 deletions

View File

@@ -3,6 +3,7 @@ const {
TaskName,
TaskPreconditions,
GoogleTranscriptionEvents,
AzureTranscriptionEvents,
AwsTranscriptionEvents
} = require('../utils/constants');
@@ -38,6 +39,12 @@ class TaskTranscribe extends Task {
this.vocabularyName = recognizer.vocabularyName;
this.vocabularyFilterName = recognizer.vocabularyFilterName;
this.filterMethod = recognizer.filterMethod;
/* microsoft options */
this.outputFormat = recognizer.outputFormat || 'simple';
this.profanityOption = recognizer.profanityOption || 'raw';
this.requestSnr = recognizer.requestSnr || false;
this.initialSpeechTimeoutMs = recognizer.initialSpeechTimeoutMs || 0;
}
get name() { return TaskName.Transcribe; }
@@ -53,7 +60,13 @@ class TaskTranscribe extends Task {
try {
if (!this.sttCredentials) {
// TODO: generate alert (actually should be done by cs.getSpeechCredentials)
const {writeAlerts, AlertType} = cs.srf.locals;
this.logger.info(`TaskTranscribe:exec - ERROR stt using ${this.vendor} requested but creds not supplied`);
writeAlerts({
account_sid: cs.accountSid,
alert_type: AlertType.STT_NOT_PROVISIONED,
vendor: this.vendor
}).catch((err) => this.logger.info({err}, 'Error generating alert for no stt'));
throw new Error('no provisioned speech credentials for TTS');
}
await this._startTranscribing(cs, ep);
@@ -70,6 +83,8 @@ class TaskTranscribe extends Task {
ep.removeCustomEventListener(AwsTranscriptionEvents.Transcription);
ep.removeCustomEventListener(AwsTranscriptionEvents.NoAudioDetected);
ep.removeCustomEventListener(AwsTranscriptionEvents.MaxDurationExceeded);
ep.removeCustomEventListener(AzureTranscriptionEvents.Transcription);
ep.removeCustomEventListener(AzureTranscriptionEvents.NoSpeechDetected);
}
async kill(cs) {
@@ -96,6 +111,8 @@ class TaskTranscribe extends Task {
ep.addCustomEventListener(AwsTranscriptionEvents.NoAudioDetected, this._onNoAudio.bind(this, cs, ep));
ep.addCustomEventListener(AwsTranscriptionEvents.MaxDurationExceeded,
this._onMaxDurationExceeded.bind(this, cs, ep));
ep.addCustomEventListener(AzureTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
ep.addCustomEventListener(AzureTranscriptionEvents.NoSpeechDetected, this._onNoAudio.bind(this, cs, ep));
if (this.vendor === 'google') {
if (this.sttCredentials) opts.GOOGLE_APPLICATION_CREDENTIALS = JSON.stringify(this.sttCredentials.credentials);
@@ -164,6 +181,22 @@ class TaskTranscribe extends Task {
await ep.set(opts)
.catch((err) => this.logger.info(err, 'TaskTranscribe:_startTranscribing with aws'));
}
else if (this.vendor === 'microsoft') {
Object.assign(opts, {
'AZURE_SUBSCRIPTION_KEY': this.sttCredentials.api_key,
'AZURE_REGION': this.sttCredentials.region
});
if (this.hints && this.hints.length > 1) {
opts.AZURE_SPEECH_HINTS = this.hints.map((h) => h.trim()).join(',');
}
if (this.requestSnr) opts.AZURE_REQUEST_SNR = 1;
if (this.profanityOption !== 'raw') opts.AZURE_PROFANITY_OPTION = this.profanityOption;
if (this.initialSpeechTimeoutMs > 0) opts.AZURE_INITIAL_SPEECH_TIMEOUT_MS = this.initialSpeechTimeoutMs;
if (this.outputFormat !== 'simple') opts.AZURE_USE_OUTPUT_FORMAT_DETAILED = 1;
await ep.set(opts)
.catch((err) => this.logger.info(err, 'TaskTranscribe:_startTranscribing with azure'));
}
await this._transcribe(ep);
}
@@ -178,6 +211,20 @@ class TaskTranscribe extends Task {
_onTranscription(cs, ep, evt) {
if ('aws' === this.vendor && Array.isArray(evt) && evt.length > 0) evt = evt[0];
if ('microsoft' === this.vendor) {
const nbest = evt.NBest;
const alternatives = nbest.map((n) => {
return {
confidence: n.Confidence,
transcript: n.Display
};
});
const newEvent = {
is_final: evt.RecognitionStatus === 'Success',
alternatives
};
evt = newEvent;
}
this.logger.debug(evt, 'TaskTranscribe:_onTranscription');
this.cs.requestor.request(this.transcriptionHook, Object.assign({speech: evt}, this.cs.callInfo))