Feature/azure recognition (#46)

* add support for microsoft speech recognition

* update to drachtio-fsmrf that support microsoft stt

* gather and transcribe now support microsoft
This commit is contained in:
Dave Horton
2021-11-26 16:40:25 -06:00
committed by GitHub
parent fe1778e9ae
commit 1e93973419
6 changed files with 180 additions and 66 deletions

View File

@@ -3,7 +3,8 @@ const {
TaskName,
TaskPreconditions,
GoogleTranscriptionEvents,
AwsTranscriptionEvents
AwsTranscriptionEvents,
AzureTranscriptionEvents
} = require('../utils/constants');
const makeTask = require('./make_task');
@@ -33,6 +34,12 @@ class TaskGather extends Task {
this.vocabularyName = recognizer.vocabularyName;
this.vocabularyFilterName = recognizer.vocabularyFilterName;
this.filterMethod = recognizer.filterMethod;
/* microsoft options */
this.outputFormat = recognizer.outputFormat || 'simple';
this.profanityOption = recognizer.profanityOption || 'raw';
this.requestSnr = recognizer.requestSnr || false;
this.initialSpeechTimeoutMs = recognizer.initialSpeechTimeoutMs || 0;
}
this.digitBuffer = '';
@@ -63,7 +70,7 @@ class TaskGather extends Task {
this.sttCredentials = cs.getSpeechCredentials(this.vendor, 'stt');
if (this.needsStt && !this.sttCredentials) {
const {writeAlerts, AlertType} = cs.srf.locals;
this.logger.info(`TaskGather:exec - ERROR stt using ${this.vendor} requested but not creds supplied`);
this.logger.info(`TaskGather:exec - ERROR stt using ${this.vendor} requested but creds not supplied`);
writeAlerts({
account_sid: cs.accountSid,
alert_type: AlertType.STT_NOT_PROVISIONED,
@@ -106,6 +113,8 @@ class TaskGather extends Task {
ep.removeCustomEventListener(GoogleTranscriptionEvents.Transcription);
ep.removeCustomEventListener(GoogleTranscriptionEvents.EndOfUtterance);
ep.removeCustomEventListener(AwsTranscriptionEvents.Transcription);
ep.removeCustomEventListener(AzureTranscriptionEvents.Transcription);
ep.removeCustomEventListener(AzureTranscriptionEvents.NoSpeechDetected);
}
kill(cs) {
@@ -135,7 +144,9 @@ class TaskGather extends Task {
GOOGLE_SPEECH_SINGLE_UTTERANCE: true,
GOOGLE_SPEECH_MODEL: 'command_and_search'
});
if (this.hints && this.hints.length > 1) opts.GOOGLE_SPEECH_HINTS = this.hints.join(',');
if (this.hints && this.hints.length > 1) {
opts.GOOGLE_SPEECH_HINTS = this.hints.map((h) => h.trim()).join(',');
}
if (this.altLanguages && this.altLanguages.length > 1) {
opts.GOOGLE_SPEECH_ALTERNATIVE_LANGUAGE_CODES = this.altLanguages.join(',');
}
@@ -145,22 +156,41 @@ class TaskGather extends Task {
ep.addCustomEventListener(GoogleTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
ep.addCustomEventListener(GoogleTranscriptionEvents.EndOfUtterance, this._onEndOfUtterance.bind(this, cs, ep));
}
else {
else if (['aws', 'polly'].includes(this.vendor)) {
if (this.vocabularyName) opts.AWS_VOCABULARY_NAME = this.vocabularyName;
if (this.vocabularyFilterName) {
opts.AWS_VOCABULARY_NAME = this.vocabularyFilterName;
opts.AWS_VOCABULARY_FILTER_METHOD = this.filterMethod || 'mask';
}
Object.assign(opts, {
AWS_ACCESS_KEY_ID: this.sttCredentials.accessKeyId,
AWS_SECRET_ACCESS_KEY: this.sttCredentials.secretAccessKey,
AWS_REGION: this.sttCredentials.region
});
if (this.sttCredentials) {
Object.assign(opts, {
AWS_ACCESS_KEY_ID: this.sttCredentials.accessKeyId,
AWS_SECRET_ACCESS_KEY: this.sttCredentials.secretAccessKey,
AWS_REGION: this.sttCredentials.region
});
}
ep.addCustomEventListener(AwsTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
}
else if ('microsoft' === this.vendor) {
if (this.sttCredentials) {
Object.assign(opts, {
'AZURE_SUBSCRIPTION_KEY': this.sttCredentials.api_key,
'AZURE_REGION': this.sttCredentials.region
});
}
if (this.hints && this.hints.length > 1) {
opts.AZURE_SPEECH_HINTS = this.hints.map((h) => h.trim()).join(',');
}
//if (this.requestSnr) opts.AZURE_REQUEST_SNR = 1;
//if (this.profanityOption !== 'raw') opts.AZURE_PROFANITY_OPTION = this.profanityOption;
if (this.initialSpeechTimeoutMs > 0) opts.AZURE_INITIAL_SPEECH_TIMEOUT_MS = this.initialSpeechTimeoutMs;
opts.AZURE_USE_OUTPUT_FORMAT_DETAILED = 1;
ep.addCustomEventListener(AzureTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
ep.addCustomEventListener(AzureTranscriptionEvents.NoSpeechDetected, this._onNoSpeechDetected.bind(this, cs, ep));
}
await ep.set(opts)
.catch((err) => this.logger.info(err, 'Error setting channel variables'));
}
_startTranscribing(ep) {
@@ -208,11 +238,21 @@ class TaskGather extends Task {
_onTranscription(cs, ep, evt) {
if ('aws' === this.vendor && Array.isArray(evt) && evt.length > 0) evt = evt[0];
this.logger.debug(evt, 'TaskGather:_onTranscription');
const final = evt.is_final;
if (final) {
this._resolve('speech', evt);
if ('microsoft' === this.vendor) {
const nbest = evt.NBest;
const newEvent = {
is_final: evt.RecognitionStatus === 'Success',
alternatives: [
{
confidence: nbest[0].Confidence,
transcript: nbest[0].Display
}
]
};
evt = newEvent;
}
this.logger.debug(evt, 'TaskGather:_onTranscription');
if (evt.is_final) this._resolve('speech', evt);
else if (this.partialResultHook) {
this.cs.requestor.request(this.partialResultHook, Object.assign({speech: evt}, this.cs.callInfo))
.catch((err) => this.logger.info(err, 'GatherTask:_onTranscription error'));
@@ -225,6 +265,10 @@ class TaskGather extends Task {
}
}
_onNoSpeechDetected(cs, ep) {
this._resolve('timeout');
}
async _resolve(reason, evt) {
if (this.resolved) return;
this.resolved = true;

View File

@@ -348,7 +348,7 @@
"properties": {
"vendor": {
"type": "string",
"enum": ["google", "aws", "polly", "default"]
"enum": ["google", "aws", "polly", "microsoft", "default"]
},
"language": "string",
"voice": "string",
@@ -365,7 +365,7 @@
"properties": {
"vendor": {
"type": "string",
"enum": ["google", "aws", "default"]
"enum": ["google", "aws", "microsoft", "default"]
},
"language": "string",
"hints": "array",
@@ -405,7 +405,24 @@
"mask",
"tag"
]
}
},
"outputFormat": {
"type": "string",
"enum": [
"simple",
"detailed"
]
},
"profanityOption": {
"type": "string",
"enum": [
"masked",
"removed",
"raw"
]
},
"requestSnr": "boolean",
"initialSpeechTimeoutMs": "number"
},
"required": [
"vendor"

View File

@@ -3,6 +3,7 @@ const {
TaskName,
TaskPreconditions,
GoogleTranscriptionEvents,
AzureTranscriptionEvents,
AwsTranscriptionEvents
} = require('../utils/constants');
@@ -38,6 +39,12 @@ class TaskTranscribe extends Task {
this.vocabularyName = recognizer.vocabularyName;
this.vocabularyFilterName = recognizer.vocabularyFilterName;
this.filterMethod = recognizer.filterMethod;
/* microsoft options */
this.outputFormat = recognizer.outputFormat || 'simple';
this.profanityOption = recognizer.profanityOption || 'raw';
this.requestSnr = recognizer.requestSnr || false;
this.initialSpeechTimeoutMs = recognizer.initialSpeechTimeoutMs || 0;
}
get name() { return TaskName.Transcribe; }
@@ -53,7 +60,13 @@ class TaskTranscribe extends Task {
try {
if (!this.sttCredentials) {
// TODO: generate alert (actually should be done by cs.getSpeechCredentials)
const {writeAlerts, AlertType} = cs.srf.locals;
this.logger.info(`TaskTranscribe:exec - ERROR stt using ${this.vendor} requested but creds not supplied`);
writeAlerts({
account_sid: cs.accountSid,
alert_type: AlertType.STT_NOT_PROVISIONED,
vendor: this.vendor
}).catch((err) => this.logger.info({err}, 'Error generating alert for no stt'));
throw new Error('no provisioned speech credentials for TTS');
}
await this._startTranscribing(cs, ep);
@@ -70,6 +83,8 @@ class TaskTranscribe extends Task {
ep.removeCustomEventListener(AwsTranscriptionEvents.Transcription);
ep.removeCustomEventListener(AwsTranscriptionEvents.NoAudioDetected);
ep.removeCustomEventListener(AwsTranscriptionEvents.MaxDurationExceeded);
ep.removeCustomEventListener(AzureTranscriptionEvents.Transcription);
ep.removeCustomEventListener(AzureTranscriptionEvents.NoSpeechDetected);
}
async kill(cs) {
@@ -96,6 +111,8 @@ class TaskTranscribe extends Task {
ep.addCustomEventListener(AwsTranscriptionEvents.NoAudioDetected, this._onNoAudio.bind(this, cs, ep));
ep.addCustomEventListener(AwsTranscriptionEvents.MaxDurationExceeded,
this._onMaxDurationExceeded.bind(this, cs, ep));
ep.addCustomEventListener(AzureTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
ep.addCustomEventListener(AzureTranscriptionEvents.NoSpeechDetected, this._onNoAudio.bind(this, cs, ep));
if (this.vendor === 'google') {
if (this.sttCredentials) opts.GOOGLE_APPLICATION_CREDENTIALS = JSON.stringify(this.sttCredentials.credentials);
@@ -164,6 +181,22 @@ class TaskTranscribe extends Task {
await ep.set(opts)
.catch((err) => this.logger.info(err, 'TaskTranscribe:_startTranscribing with aws'));
}
else if (this.vendor === 'microsoft') {
Object.assign(opts, {
'AZURE_SUBSCRIPTION_KEY': this.sttCredentials.api_key,
'AZURE_REGION': this.sttCredentials.region
});
if (this.hints && this.hints.length > 1) {
opts.AZURE_SPEECH_HINTS = this.hints.map((h) => h.trim()).join(',');
}
if (this.requestSnr) opts.AZURE_REQUEST_SNR = 1;
if (this.profanityOption !== 'raw') opts.AZURE_PROFANITY_OPTION = this.profanityOption;
if (this.initialSpeechTimeoutMs > 0) opts.AZURE_INITIAL_SPEECH_TIMEOUT_MS = this.initialSpeechTimeoutMs;
if (this.outputFormat !== 'simple') opts.AZURE_USE_OUTPUT_FORMAT_DETAILED = 1;
await ep.set(opts)
.catch((err) => this.logger.info(err, 'TaskTranscribe:_startTranscribing with azure'));
}
await this._transcribe(ep);
}
@@ -178,6 +211,20 @@ class TaskTranscribe extends Task {
_onTranscription(cs, ep, evt) {
if ('aws' === this.vendor && Array.isArray(evt) && evt.length > 0) evt = evt[0];
if ('microsoft' === this.vendor) {
const nbest = evt.NBest;
const alternatives = nbest.map((n) => {
return {
confidence: n.Confidence,
transcript: n.Display
};
});
const newEvent = {
is_final: evt.RecognitionStatus === 'Success',
alternatives
};
evt = newEvent;
}
this.logger.debug(evt, 'TaskTranscribe:_onTranscription');
this.cs.requestor.request(this.transcriptionHook, Object.assign({speech: evt}, this.cs.callInfo))