mirror of
https://github.com/jambonz/jambonz-feature-server.git
synced 2025-12-20 16:50:39 +00:00
Feature/azure recognition (#46)
* add support for microsoft speech recognition * update to drachtio-fsmrf that support microsoft stt * gather and transcribe now support microsoft
This commit is contained in:
@@ -3,7 +3,8 @@ const {
|
||||
TaskName,
|
||||
TaskPreconditions,
|
||||
GoogleTranscriptionEvents,
|
||||
AwsTranscriptionEvents
|
||||
AwsTranscriptionEvents,
|
||||
AzureTranscriptionEvents
|
||||
} = require('../utils/constants');
|
||||
|
||||
const makeTask = require('./make_task');
|
||||
@@ -33,6 +34,12 @@ class TaskGather extends Task {
|
||||
this.vocabularyName = recognizer.vocabularyName;
|
||||
this.vocabularyFilterName = recognizer.vocabularyFilterName;
|
||||
this.filterMethod = recognizer.filterMethod;
|
||||
|
||||
/* microsoft options */
|
||||
this.outputFormat = recognizer.outputFormat || 'simple';
|
||||
this.profanityOption = recognizer.profanityOption || 'raw';
|
||||
this.requestSnr = recognizer.requestSnr || false;
|
||||
this.initialSpeechTimeoutMs = recognizer.initialSpeechTimeoutMs || 0;
|
||||
}
|
||||
|
||||
this.digitBuffer = '';
|
||||
@@ -63,7 +70,7 @@ class TaskGather extends Task {
|
||||
this.sttCredentials = cs.getSpeechCredentials(this.vendor, 'stt');
|
||||
if (this.needsStt && !this.sttCredentials) {
|
||||
const {writeAlerts, AlertType} = cs.srf.locals;
|
||||
this.logger.info(`TaskGather:exec - ERROR stt using ${this.vendor} requested but not creds supplied`);
|
||||
this.logger.info(`TaskGather:exec - ERROR stt using ${this.vendor} requested but creds not supplied`);
|
||||
writeAlerts({
|
||||
account_sid: cs.accountSid,
|
||||
alert_type: AlertType.STT_NOT_PROVISIONED,
|
||||
@@ -106,6 +113,8 @@ class TaskGather extends Task {
|
||||
ep.removeCustomEventListener(GoogleTranscriptionEvents.Transcription);
|
||||
ep.removeCustomEventListener(GoogleTranscriptionEvents.EndOfUtterance);
|
||||
ep.removeCustomEventListener(AwsTranscriptionEvents.Transcription);
|
||||
ep.removeCustomEventListener(AzureTranscriptionEvents.Transcription);
|
||||
ep.removeCustomEventListener(AzureTranscriptionEvents.NoSpeechDetected);
|
||||
}
|
||||
|
||||
kill(cs) {
|
||||
@@ -135,7 +144,9 @@ class TaskGather extends Task {
|
||||
GOOGLE_SPEECH_SINGLE_UTTERANCE: true,
|
||||
GOOGLE_SPEECH_MODEL: 'command_and_search'
|
||||
});
|
||||
if (this.hints && this.hints.length > 1) opts.GOOGLE_SPEECH_HINTS = this.hints.join(',');
|
||||
if (this.hints && this.hints.length > 1) {
|
||||
opts.GOOGLE_SPEECH_HINTS = this.hints.map((h) => h.trim()).join(',');
|
||||
}
|
||||
if (this.altLanguages && this.altLanguages.length > 1) {
|
||||
opts.GOOGLE_SPEECH_ALTERNATIVE_LANGUAGE_CODES = this.altLanguages.join(',');
|
||||
}
|
||||
@@ -145,22 +156,41 @@ class TaskGather extends Task {
|
||||
ep.addCustomEventListener(GoogleTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
|
||||
ep.addCustomEventListener(GoogleTranscriptionEvents.EndOfUtterance, this._onEndOfUtterance.bind(this, cs, ep));
|
||||
}
|
||||
else {
|
||||
else if (['aws', 'polly'].includes(this.vendor)) {
|
||||
if (this.vocabularyName) opts.AWS_VOCABULARY_NAME = this.vocabularyName;
|
||||
if (this.vocabularyFilterName) {
|
||||
opts.AWS_VOCABULARY_NAME = this.vocabularyFilterName;
|
||||
opts.AWS_VOCABULARY_FILTER_METHOD = this.filterMethod || 'mask';
|
||||
}
|
||||
Object.assign(opts, {
|
||||
AWS_ACCESS_KEY_ID: this.sttCredentials.accessKeyId,
|
||||
AWS_SECRET_ACCESS_KEY: this.sttCredentials.secretAccessKey,
|
||||
AWS_REGION: this.sttCredentials.region
|
||||
});
|
||||
if (this.sttCredentials) {
|
||||
Object.assign(opts, {
|
||||
AWS_ACCESS_KEY_ID: this.sttCredentials.accessKeyId,
|
||||
AWS_SECRET_ACCESS_KEY: this.sttCredentials.secretAccessKey,
|
||||
AWS_REGION: this.sttCredentials.region
|
||||
});
|
||||
}
|
||||
ep.addCustomEventListener(AwsTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
|
||||
}
|
||||
else if ('microsoft' === this.vendor) {
|
||||
if (this.sttCredentials) {
|
||||
Object.assign(opts, {
|
||||
'AZURE_SUBSCRIPTION_KEY': this.sttCredentials.api_key,
|
||||
'AZURE_REGION': this.sttCredentials.region
|
||||
});
|
||||
}
|
||||
if (this.hints && this.hints.length > 1) {
|
||||
opts.AZURE_SPEECH_HINTS = this.hints.map((h) => h.trim()).join(',');
|
||||
}
|
||||
//if (this.requestSnr) opts.AZURE_REQUEST_SNR = 1;
|
||||
//if (this.profanityOption !== 'raw') opts.AZURE_PROFANITY_OPTION = this.profanityOption;
|
||||
if (this.initialSpeechTimeoutMs > 0) opts.AZURE_INITIAL_SPEECH_TIMEOUT_MS = this.initialSpeechTimeoutMs;
|
||||
opts.AZURE_USE_OUTPUT_FORMAT_DETAILED = 1;
|
||||
|
||||
ep.addCustomEventListener(AzureTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
|
||||
ep.addCustomEventListener(AzureTranscriptionEvents.NoSpeechDetected, this._onNoSpeechDetected.bind(this, cs, ep));
|
||||
}
|
||||
await ep.set(opts)
|
||||
.catch((err) => this.logger.info(err, 'Error setting channel variables'));
|
||||
|
||||
}
|
||||
|
||||
_startTranscribing(ep) {
|
||||
@@ -208,11 +238,21 @@ class TaskGather extends Task {
|
||||
|
||||
_onTranscription(cs, ep, evt) {
|
||||
if ('aws' === this.vendor && Array.isArray(evt) && evt.length > 0) evt = evt[0];
|
||||
this.logger.debug(evt, 'TaskGather:_onTranscription');
|
||||
const final = evt.is_final;
|
||||
if (final) {
|
||||
this._resolve('speech', evt);
|
||||
if ('microsoft' === this.vendor) {
|
||||
const nbest = evt.NBest;
|
||||
const newEvent = {
|
||||
is_final: evt.RecognitionStatus === 'Success',
|
||||
alternatives: [
|
||||
{
|
||||
confidence: nbest[0].Confidence,
|
||||
transcript: nbest[0].Display
|
||||
}
|
||||
]
|
||||
};
|
||||
evt = newEvent;
|
||||
}
|
||||
this.logger.debug(evt, 'TaskGather:_onTranscription');
|
||||
if (evt.is_final) this._resolve('speech', evt);
|
||||
else if (this.partialResultHook) {
|
||||
this.cs.requestor.request(this.partialResultHook, Object.assign({speech: evt}, this.cs.callInfo))
|
||||
.catch((err) => this.logger.info(err, 'GatherTask:_onTranscription error'));
|
||||
@@ -225,6 +265,10 @@ class TaskGather extends Task {
|
||||
}
|
||||
}
|
||||
|
||||
_onNoSpeechDetected(cs, ep) {
|
||||
this._resolve('timeout');
|
||||
}
|
||||
|
||||
async _resolve(reason, evt) {
|
||||
if (this.resolved) return;
|
||||
this.resolved = true;
|
||||
|
||||
@@ -348,7 +348,7 @@
|
||||
"properties": {
|
||||
"vendor": {
|
||||
"type": "string",
|
||||
"enum": ["google", "aws", "polly", "default"]
|
||||
"enum": ["google", "aws", "polly", "microsoft", "default"]
|
||||
},
|
||||
"language": "string",
|
||||
"voice": "string",
|
||||
@@ -365,7 +365,7 @@
|
||||
"properties": {
|
||||
"vendor": {
|
||||
"type": "string",
|
||||
"enum": ["google", "aws", "default"]
|
||||
"enum": ["google", "aws", "microsoft", "default"]
|
||||
},
|
||||
"language": "string",
|
||||
"hints": "array",
|
||||
@@ -405,7 +405,24 @@
|
||||
"mask",
|
||||
"tag"
|
||||
]
|
||||
}
|
||||
},
|
||||
"outputFormat": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"simple",
|
||||
"detailed"
|
||||
]
|
||||
},
|
||||
"profanityOption": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"masked",
|
||||
"removed",
|
||||
"raw"
|
||||
]
|
||||
},
|
||||
"requestSnr": "boolean",
|
||||
"initialSpeechTimeoutMs": "number"
|
||||
},
|
||||
"required": [
|
||||
"vendor"
|
||||
|
||||
@@ -3,6 +3,7 @@ const {
|
||||
TaskName,
|
||||
TaskPreconditions,
|
||||
GoogleTranscriptionEvents,
|
||||
AzureTranscriptionEvents,
|
||||
AwsTranscriptionEvents
|
||||
} = require('../utils/constants');
|
||||
|
||||
@@ -38,6 +39,12 @@ class TaskTranscribe extends Task {
|
||||
this.vocabularyName = recognizer.vocabularyName;
|
||||
this.vocabularyFilterName = recognizer.vocabularyFilterName;
|
||||
this.filterMethod = recognizer.filterMethod;
|
||||
|
||||
/* microsoft options */
|
||||
this.outputFormat = recognizer.outputFormat || 'simple';
|
||||
this.profanityOption = recognizer.profanityOption || 'raw';
|
||||
this.requestSnr = recognizer.requestSnr || false;
|
||||
this.initialSpeechTimeoutMs = recognizer.initialSpeechTimeoutMs || 0;
|
||||
}
|
||||
|
||||
get name() { return TaskName.Transcribe; }
|
||||
@@ -53,7 +60,13 @@ class TaskTranscribe extends Task {
|
||||
|
||||
try {
|
||||
if (!this.sttCredentials) {
|
||||
// TODO: generate alert (actually should be done by cs.getSpeechCredentials)
|
||||
const {writeAlerts, AlertType} = cs.srf.locals;
|
||||
this.logger.info(`TaskTranscribe:exec - ERROR stt using ${this.vendor} requested but creds not supplied`);
|
||||
writeAlerts({
|
||||
account_sid: cs.accountSid,
|
||||
alert_type: AlertType.STT_NOT_PROVISIONED,
|
||||
vendor: this.vendor
|
||||
}).catch((err) => this.logger.info({err}, 'Error generating alert for no stt'));
|
||||
throw new Error('no provisioned speech credentials for TTS');
|
||||
}
|
||||
await this._startTranscribing(cs, ep);
|
||||
@@ -70,6 +83,8 @@ class TaskTranscribe extends Task {
|
||||
ep.removeCustomEventListener(AwsTranscriptionEvents.Transcription);
|
||||
ep.removeCustomEventListener(AwsTranscriptionEvents.NoAudioDetected);
|
||||
ep.removeCustomEventListener(AwsTranscriptionEvents.MaxDurationExceeded);
|
||||
ep.removeCustomEventListener(AzureTranscriptionEvents.Transcription);
|
||||
ep.removeCustomEventListener(AzureTranscriptionEvents.NoSpeechDetected);
|
||||
}
|
||||
|
||||
async kill(cs) {
|
||||
@@ -96,6 +111,8 @@ class TaskTranscribe extends Task {
|
||||
ep.addCustomEventListener(AwsTranscriptionEvents.NoAudioDetected, this._onNoAudio.bind(this, cs, ep));
|
||||
ep.addCustomEventListener(AwsTranscriptionEvents.MaxDurationExceeded,
|
||||
this._onMaxDurationExceeded.bind(this, cs, ep));
|
||||
ep.addCustomEventListener(AzureTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
|
||||
ep.addCustomEventListener(AzureTranscriptionEvents.NoSpeechDetected, this._onNoAudio.bind(this, cs, ep));
|
||||
|
||||
if (this.vendor === 'google') {
|
||||
if (this.sttCredentials) opts.GOOGLE_APPLICATION_CREDENTIALS = JSON.stringify(this.sttCredentials.credentials);
|
||||
@@ -164,6 +181,22 @@ class TaskTranscribe extends Task {
|
||||
await ep.set(opts)
|
||||
.catch((err) => this.logger.info(err, 'TaskTranscribe:_startTranscribing with aws'));
|
||||
}
|
||||
else if (this.vendor === 'microsoft') {
|
||||
Object.assign(opts, {
|
||||
'AZURE_SUBSCRIPTION_KEY': this.sttCredentials.api_key,
|
||||
'AZURE_REGION': this.sttCredentials.region
|
||||
});
|
||||
if (this.hints && this.hints.length > 1) {
|
||||
opts.AZURE_SPEECH_HINTS = this.hints.map((h) => h.trim()).join(',');
|
||||
}
|
||||
if (this.requestSnr) opts.AZURE_REQUEST_SNR = 1;
|
||||
if (this.profanityOption !== 'raw') opts.AZURE_PROFANITY_OPTION = this.profanityOption;
|
||||
if (this.initialSpeechTimeoutMs > 0) opts.AZURE_INITIAL_SPEECH_TIMEOUT_MS = this.initialSpeechTimeoutMs;
|
||||
if (this.outputFormat !== 'simple') opts.AZURE_USE_OUTPUT_FORMAT_DETAILED = 1;
|
||||
|
||||
await ep.set(opts)
|
||||
.catch((err) => this.logger.info(err, 'TaskTranscribe:_startTranscribing with azure'));
|
||||
}
|
||||
await this._transcribe(ep);
|
||||
}
|
||||
|
||||
@@ -178,6 +211,20 @@ class TaskTranscribe extends Task {
|
||||
|
||||
_onTranscription(cs, ep, evt) {
|
||||
if ('aws' === this.vendor && Array.isArray(evt) && evt.length > 0) evt = evt[0];
|
||||
if ('microsoft' === this.vendor) {
|
||||
const nbest = evt.NBest;
|
||||
const alternatives = nbest.map((n) => {
|
||||
return {
|
||||
confidence: n.Confidence,
|
||||
transcript: n.Display
|
||||
};
|
||||
});
|
||||
const newEvent = {
|
||||
is_final: evt.RecognitionStatus === 'Success',
|
||||
alternatives
|
||||
};
|
||||
evt = newEvent;
|
||||
}
|
||||
this.logger.debug(evt, 'TaskTranscribe:_onTranscription');
|
||||
|
||||
this.cs.requestor.request(this.transcriptionHook, Object.assign({speech: evt}, this.cs.callInfo))
|
||||
|
||||
@@ -65,6 +65,12 @@
|
||||
"NoAudioDetected": "aws_transcribe::no_audio_detected",
|
||||
"MaxDurationExceeded": "aws_transcribe::max_duration_exceeded"
|
||||
},
|
||||
"AzureTranscriptionEvents": {
|
||||
"Transcription": "azure_transcribe::transcription",
|
||||
"StartOfUtterance": "azure_transcribe::start_of_utterance",
|
||||
"EndOfUtterance": "azure_transcribe::end_of_utterance",
|
||||
"NoSpeechDetected": "azure_transcribe::no_speech_detected"
|
||||
},
|
||||
"ListenEvents": {
|
||||
"Connect": "mod_audio_fork::connect",
|
||||
"ConnectFailure": "mod_audio_fork::connect_failed",
|
||||
|
||||
Reference in New Issue
Block a user