mirror of
https://github.com/jambonz/jambonz-feature-server.git
synced 2025-12-20 16:50:39 +00:00
update transcribe to support google v1p1beta1 and aws
This commit is contained in:
@@ -47,6 +47,7 @@ class Dialogflow extends Task {
|
||||
this.language = this.data.tts.language || 'default';
|
||||
this.voice = this.data.tts.voice || 'default';
|
||||
}
|
||||
this.bargein = this.data.bargein;
|
||||
}
|
||||
|
||||
get name() { return TaskName.Dialogflow; }
|
||||
@@ -266,7 +267,7 @@ class Dialogflow extends Task {
|
||||
* @param {*} ep - media server endpoint
|
||||
* @param {*} evt - event data
|
||||
*/
|
||||
_onTranscription(ep, cs, evt) {
|
||||
async _onTranscription(ep, cs, evt) {
|
||||
const transcription = new Transcription(this.logger, evt);
|
||||
|
||||
if (this.events.includes('transcription') && transcription.isFinal) {
|
||||
@@ -281,6 +282,13 @@ class Dialogflow extends Task {
|
||||
transcription.confidence > 0.8) {
|
||||
ep.play(this.data.thinkingSound).catch((err) => this.logger.info(err, 'Error playing typing sound'));
|
||||
}
|
||||
|
||||
// interrupt playback on speaking if bargein = true
|
||||
if (this.bargein && this.playInProgress) {
|
||||
this.logger.debug('terminating playback due to speech bargein');
|
||||
this.playInProgress = false;
|
||||
await ep.api('uuid_break', ep.uuid);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -10,15 +10,18 @@ class TaskGather extends Task {
|
||||
|
||||
[
|
||||
'finishOnKey', 'hints', 'input', 'numDigits',
|
||||
'partialResultHook', 'profanityFilter',
|
||||
'partialResultHook',
|
||||
'speechTimeout', 'timeout', 'say', 'play'
|
||||
].forEach((k) => this[k] = this.data[k]);
|
||||
|
||||
this.timeout = (this.timeout || 5) * 1000;
|
||||
this.interim = this.partialResultCallback;
|
||||
if (this.data.recognizer) {
|
||||
this.language = this.data.recognizer.language || 'en-US';
|
||||
this.vendor = this.data.recognizer.vendor;
|
||||
const recognizer = this.data.recognizer;
|
||||
this.language = recognizer.language;
|
||||
if (recognizer.hints && recognizer.hints.length > 0) {
|
||||
this.hints = recognizer.hints.join(',');
|
||||
}
|
||||
}
|
||||
|
||||
this.digitBuffer = '';
|
||||
|
||||
@@ -134,7 +134,8 @@
|
||||
"noInputEvent": "string",
|
||||
"passDtmfAsTextInput": "boolean",
|
||||
"thinkingMusic": "string",
|
||||
"tts": "#synthesizer"
|
||||
"tts": "#synthesizer",
|
||||
"bargein": "boolean"
|
||||
},
|
||||
"required": [
|
||||
"project",
|
||||
@@ -271,7 +272,8 @@
|
||||
"earlyMedia": "boolean"
|
||||
},
|
||||
"required": [
|
||||
"transcriptionHook"
|
||||
"transcriptionHook",
|
||||
"recognizer"
|
||||
]
|
||||
},
|
||||
"target": {
|
||||
@@ -327,13 +329,47 @@
|
||||
"properties": {
|
||||
"vendor": {
|
||||
"type": "string",
|
||||
"enum": ["google"]
|
||||
"enum": ["google", "aws"]
|
||||
},
|
||||
"language": "string",
|
||||
"hints": "array",
|
||||
"altLanguages": "array",
|
||||
"profanityFilter": "boolean",
|
||||
"interim": "boolean",
|
||||
"dualChannel": "boolean"
|
||||
"singleUtterance": "boolean",
|
||||
"dualChannel": "boolean",
|
||||
"separateRecognitionPerChannel": "boolean",
|
||||
"punctuation": "boolean",
|
||||
"enhancedModel": "boolean",
|
||||
"words": "boolean",
|
||||
"diarization": "boolean",
|
||||
"diarizationMinSpeakers": "number",
|
||||
"diarizationMaxSpeakers": "number",
|
||||
"interactionType": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"unspecified",
|
||||
"discussion",
|
||||
"presentation",
|
||||
"phone_call",
|
||||
"voicemail",
|
||||
"voice_search",
|
||||
"voice_command",
|
||||
"dictation"
|
||||
]
|
||||
},
|
||||
"naicsCode": "number",
|
||||
"identifyChannels": "boolean",
|
||||
"vocabularyName": "string",
|
||||
"vocabularyFilterName": "string",
|
||||
"filterMethod": {
|
||||
"type": "string",
|
||||
"enum": [
|
||||
"remove",
|
||||
"mask",
|
||||
"tag"
|
||||
]
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"vendor"
|
||||
|
||||
@@ -1,5 +1,10 @@
|
||||
const Task = require('./task');
|
||||
const {TaskName, TaskPreconditions, TranscriptionEvents} = require('../utils/constants');
|
||||
const {
|
||||
TaskName,
|
||||
TaskPreconditions,
|
||||
GoogleTranscriptionEvents,
|
||||
AwsTranscriptionEvents
|
||||
} = require('../utils/constants');
|
||||
|
||||
class TaskTranscribe extends Task {
|
||||
constructor(logger, opts, parentTask) {
|
||||
@@ -8,12 +13,33 @@ class TaskTranscribe extends Task {
|
||||
|
||||
this.transcriptionHook = this.data.transcriptionHook;
|
||||
this.earlyMedia = this.data.earlyMedia === true || (parentTask && parentTask.earlyMedia);
|
||||
if (this.data.recognizer) {
|
||||
this.language = this.data.recognizer.language || 'en-US';
|
||||
this.vendor = this.data.recognizer.vendor;
|
||||
this.interim = this.data.recognizer.interim === true;
|
||||
this.dualChannel = this.data.recognizer.dualChannel === true;
|
||||
}
|
||||
|
||||
const recognizer = this.data.recognizer;
|
||||
this.vendor = recognizer.vendor;
|
||||
if ('default' === this.vendor || !this.vendor) this.vendor = this.callSession.speechRecognizerVendor
|
||||
this.language = recognizer.language;
|
||||
if ('default' === this.language || !this.language) this.language = this.callSession.speechRecognizerLanguage;
|
||||
this.interim = !!recognizer.interim;
|
||||
this.separateRecognitionPerChannel = recognizer.separateRecognitionPerChannel;
|
||||
|
||||
/* google-specific options */
|
||||
this.hints = recognizer.hints || [];
|
||||
this.profanityFilter = recognizer.profanityFilter;
|
||||
this.punctuation = !!recognizer.punctuation;
|
||||
this.enhancedModel = !!recognizer.enhancedModel;
|
||||
this.words = !!recognizer.words;
|
||||
this.diarization = !!recognizer.diarization;
|
||||
this.diarizationMinSpeakers = recognizer.diarizationMinSpeakers || 0;
|
||||
this.diarizationMaxSpeakers = recognizer.diarizationMaxSpeakers || 0;
|
||||
this.interactionType = recognizer.interactionType || 'unspecified';
|
||||
this.naicsCode = recognizer.naicsCode || 0;
|
||||
this.altLanguages = recognizer.altLanguages || [];
|
||||
|
||||
/* aws-specific options */
|
||||
this.identifyChannels = !!recognizer.identifyChannels;
|
||||
this.vocabularyName = recognizer.vocabularyName;
|
||||
this.vocabularyFilterName = recognizer.vocabularyFilterName;
|
||||
this.filterMethod = recognizer.filterMethod;
|
||||
}
|
||||
|
||||
get name() { return TaskName.Transcribe; }
|
||||
@@ -27,15 +53,19 @@ class TaskTranscribe extends Task {
|
||||
} catch (err) {
|
||||
this.logger.info(err, 'TaskTranscribe:exec - error');
|
||||
}
|
||||
ep.removeCustomEventListener(TranscriptionEvents.Transcription);
|
||||
ep.removeCustomEventListener(TranscriptionEvents.NoAudioDetected);
|
||||
ep.removeCustomEventListener(TranscriptionEvents.MaxDurationExceeded);
|
||||
ep.removeCustomEventListener(GoogleTranscriptionEvents.Transcription);
|
||||
ep.removeCustomEventListener(GoogleTranscriptionEvents.NoAudioDetected);
|
||||
ep.removeCustomEventListener(GoogleTranscriptionEvents.MaxDurationExceeded);
|
||||
ep.removeCustomEventListener(AwsTranscriptionEvents.Transcription);
|
||||
ep.removeCustomEventListener(AwsTranscriptionEvents.NoAudioDetected);
|
||||
ep.removeCustomEventListener(AwsTranscriptionEvents.MaxDurationExceeded);
|
||||
}
|
||||
|
||||
async kill(cs) {
|
||||
super.kill(cs);
|
||||
if (this.ep.connected) {
|
||||
this.ep.stopTranscription().catch((err) => this.logger.info(err, 'Error TaskTranscribe:kill'));
|
||||
this.ep.stopTranscription({vendor: this.vendor})
|
||||
.catch((err) => this.logger.info(err, 'Error TaskTranscribe:kill'));
|
||||
|
||||
// hangup after 1 sec if we don't get a final transcription
|
||||
this._timer = setTimeout(() => this.notifyTaskDone(), 1000);
|
||||
@@ -45,34 +75,83 @@ class TaskTranscribe extends Task {
|
||||
}
|
||||
|
||||
async _startTranscribing(ep) {
|
||||
const opts = {
|
||||
GOOGLE_SPEECH_USE_ENHANCED: true,
|
||||
GOOGLE_SPEECH_MODEL: 'phone_call'
|
||||
};
|
||||
if (this.hints) {
|
||||
Object.assign(opts, {'GOOGLE_SPEECH_HINTS': this.hints.join(',')});
|
||||
}
|
||||
if (this.profanityFilter) {
|
||||
Object.assign(opts, {'GOOGLE_SPEECH_PROFANITY_FILTER': true});
|
||||
}
|
||||
if (this.dualChannel) {
|
||||
Object.assign(opts, {'GOOGLE_SPEECH_SEPARATE_RECOGNITION_PER_CHANNEL': true});
|
||||
}
|
||||
await ep.set(opts)
|
||||
.catch((err) => this.logger.info(err, 'TaskTranscribe:_startTranscribing'));
|
||||
const opts = {};
|
||||
|
||||
ep.addCustomEventListener(TranscriptionEvents.Transcription, this._onTranscription.bind(this, ep));
|
||||
ep.addCustomEventListener(TranscriptionEvents.NoAudioDetected, this._onNoAudio.bind(this, ep));
|
||||
ep.addCustomEventListener(TranscriptionEvents.MaxDurationExceeded, this._onMaxDurationExceeded.bind(this, ep));
|
||||
ep.addCustomEventListener(GoogleTranscriptionEvents.Transcription, this._onTranscription.bind(this, ep));
|
||||
ep.addCustomEventListener(GoogleTranscriptionEvents.NoAudioDetected, this._onNoAudio.bind(this, ep));
|
||||
ep.addCustomEventListener(GoogleTranscriptionEvents.MaxDurationExceeded,
|
||||
this._onMaxDurationExceeded.bind(this, ep));
|
||||
ep.addCustomEventListener(AwsTranscriptionEvents.Transcription, this._onTranscription.bind(this, ep));
|
||||
ep.addCustomEventListener(AwsTranscriptionEvents.NoAudioDetected, this._onNoAudio.bind(this, ep));
|
||||
ep.addCustomEventListener(AwsTranscriptionEvents.MaxDurationExceeded,
|
||||
this._onMaxDurationExceeded.bind(this, ep));
|
||||
|
||||
if (this.vendor === 'google') {
|
||||
[
|
||||
['enhancedModel', 'GOOGLE_SPEECH_USE_ENHANCED'],
|
||||
['separateRecognitionPerChannel', 'GOOGLE_SPEECH_SEPARATE_RECOGNITION_PER_CHANNEL'],
|
||||
['profanityFilter', 'GOOGLE_SPEECH_PROFANITY_FILTER'],
|
||||
['punctuation', 'GOOGLE_SPEECH_ENABLE_AUTOMATIC_PUNCTUATION'],
|
||||
['words', 'GOOGLE_SPEECH_ENABLE_WORD_TIME_OFFSETS'],
|
||||
['diarization', 'GOOGLE_SPEECH_PROFANITY_FILTER']
|
||||
].forEach((arr) => {
|
||||
if (this[arr[0]]) opts[arr[1]] = true;
|
||||
});
|
||||
if (this.hints.length > 1) opts.GOOGLE_SPEECH_HINTS = this.hints.join(',');
|
||||
if (this.altLanguages.length > 1) opts.GOOGLE_SPEECH_ALTERNATIVE_LANGUAGE_CODES = this.altLanguages.join(',');
|
||||
if ('unspecified' !== this.interactionType) {
|
||||
opts.GOOGLE_SPEECH_METADATA_INTERACTION_TYPE = this.interactionType;
|
||||
|
||||
// additionally set model if appropriate
|
||||
if ('phone_call' === this.interactionType) opts.GOOGLE_SPEECH_MODEL = 'phone_call';
|
||||
else if (['voice_search', 'voice_command'].includes(this.interactionType)) {
|
||||
opts.GOOGLE_SPEECH_MODEL = 'command_and_search';
|
||||
}
|
||||
else opts.GOOGLE_SPEECH_MODEL = 'phone_call';
|
||||
}
|
||||
else opts.GOOGLE_SPEECH_MODEL = 'phone_call';
|
||||
if (this.diarization && this.diarizationMinSpeakers > 0) {
|
||||
opts.GOOGLE_SPEECH_SPEAKER_DIARIZATION_MIN_SPEAKER_COUNT = this.diarizationMinSpeakers;
|
||||
}
|
||||
if (this.diarization && this.diarizationMaxSpeakers > 0) {
|
||||
opts.GOOGLE_SPEECH_SPEAKER_DIARIZATION_MAX_SPEAKER_COUNT = this.diarizationMaxSpeakers;
|
||||
}
|
||||
if (this.naicsCode > 0) opts.GOOGLE_SPEECH_METADATA_INDUSTRY_NAICS_CODE = this.naicsCode;
|
||||
|
||||
await ep.set(opts)
|
||||
.catch((err) => this.logger.info(err, 'TaskTranscribe:_startTranscribing with google'));
|
||||
}
|
||||
else if (this.vendor === 'aws') {
|
||||
[
|
||||
['diarization', 'AWS_SHOW_SPEAKER_LABEL'],
|
||||
['identifyChannels', 'AWS_ENABLE_CHANNEL_IDENTIFICATION']
|
||||
].forEach((arr) => {
|
||||
if (this[arr[0]]) opts[arr[1]] = true;
|
||||
});
|
||||
if (this.vocabularyName) opts.AWS_VOCABULARY_NAME = this.vocabularyName;
|
||||
if (this.vocabularyFilterName) {
|
||||
opts.AWS_VOCABULARY_NAME = this.vocabularyFilterName;
|
||||
opts.AWS_VOCABULARY_FILTER_METHOD = this.filterMethod || 'mask';
|
||||
}
|
||||
|
||||
Object.assign(opts, {
|
||||
AWS_ACCESS_KEY_ID: process.env.AWS_ACCESS_KEY_ID,
|
||||
AWS_SECRET_ACCESS_KEY: process.env.AWS_SECRET_ACCESS_KEY,
|
||||
AWS_REGION: process.env.AWS_REGION
|
||||
});
|
||||
|
||||
await ep.set(opts)
|
||||
.catch((err) => this.logger.info(err, 'TaskTranscribe:_startTranscribing with aws'));
|
||||
}
|
||||
await this._transcribe(ep);
|
||||
}
|
||||
|
||||
async _transcribe(ep) {
|
||||
await this.ep.startTranscription({
|
||||
await ep.startTranscription({
|
||||
vendor: this.vendor,
|
||||
interim: this.interim ? true : false,
|
||||
language: this.language || this.callSession.speechRecognizerLanguage,
|
||||
channels: this.dualChannel ? 2 : 1
|
||||
language: this.language,
|
||||
channels: this.separateRecognitionPerChannel ? 2 : 1
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user