mirror of
https://github.com/jambonz/jambonz-feature-server.git
synced 2025-12-20 16:50:39 +00:00
Feature/ibm watson (#193)
* initial changes to support ibm watson * update specs.json for ibm * update to drachtio-fsmrf with support for ibm * bugfix: set access token for ibm stt, not api_key * fix name of api_key * normalize ibm transcription results * rework ibm credentials * bugfix setting runtime speech creds * bugfix: ibm region * typo * changes to transcribe for ibm watson * implement connect handler * bugfix: bind error * proper use of result_index * ibm error handling
This commit is contained in:
@@ -575,6 +575,12 @@ class CallSession extends Emitter {
|
||||
api_key: credential.api_key
|
||||
};
|
||||
}
|
||||
else if ('ibm' === vendor) {
|
||||
return {
|
||||
speech_credential_sid: credential.speech_credential_sid,
|
||||
api_key: credential.api_key
|
||||
};
|
||||
}
|
||||
}
|
||||
else {
|
||||
writeAlerts({
|
||||
|
||||
@@ -6,7 +6,8 @@ const {
|
||||
NuanceTranscriptionEvents,
|
||||
AwsTranscriptionEvents,
|
||||
AzureTranscriptionEvents,
|
||||
DeepgramTranscriptionEvents
|
||||
DeepgramTranscriptionEvents,
|
||||
IbmTranscriptionEvents
|
||||
} = require('../utils/constants');
|
||||
|
||||
const makeTask = require('./make_task');
|
||||
@@ -30,7 +31,8 @@ class TaskGather extends Task {
|
||||
const {
|
||||
setChannelVarsForStt,
|
||||
normalizeTranscription,
|
||||
removeSpeechListeners
|
||||
removeSpeechListeners,
|
||||
setSpeechCredentialsAtRuntime
|
||||
} = require('../utils/transcription-utils')(logger);
|
||||
this.setChannelVarsForStt = setChannelVarsForStt;
|
||||
this.normalizeTranscription = normalizeTranscription;
|
||||
@@ -56,14 +58,7 @@ class TaskGather extends Task {
|
||||
this.language = recognizer.language;
|
||||
|
||||
/* let credentials be supplied in the recognizer object at runtime */
|
||||
if (recognizer.vendor === 'nuance') {
|
||||
const {clientId, secret} = recognizer.nuanceOptions;
|
||||
if (clientId && secret) this.sttCredentials = {client_id: clientId, secret};
|
||||
}
|
||||
else if (recognizer.vendor === 'deepgram') {
|
||||
const {apiKey} = recognizer.deepgramOptions;
|
||||
if (apiKey) this.sttCredentials = {api_key: apiKey};
|
||||
}
|
||||
this.sttCredentials = setSpeechCredentialsAtRuntime(recognizer);
|
||||
|
||||
/* continuous ASR (i.e. compile transcripts until a special timeout or dtmf key) */
|
||||
this.asrTimeout = typeof recognizer.asrTimeout === 'number' ? recognizer.asrTimeout * 1000 : 0;
|
||||
@@ -120,7 +115,7 @@ class TaskGather extends Task {
|
||||
this.logger.debug('Gather:exec');
|
||||
await super.exec(cs);
|
||||
const {updateSpeechCredentialLastUsed} = require('../utils/db-utils')(this.logger, cs.srf);
|
||||
const {getNuanceAccessToken} = cs.srf.locals.dbHelpers;
|
||||
const {getNuanceAccessToken, getIbmAccessToken} = cs.srf.locals.dbHelpers;
|
||||
|
||||
if (cs.hasGlobalSttHints) {
|
||||
const {hints, hintsBoost} = cs.globalSttHints;
|
||||
@@ -173,6 +168,13 @@ class TaskGather extends Task {
|
||||
this.logger.debug({client_id}, `Gather:exec - got nuance access token ${servedFromCache ? 'from cache' : ''}`);
|
||||
this.sttCredentials = {...this.sttCredentials, access_token};
|
||||
}
|
||||
else if (this.vendor == 'ibm' && this.sttCredentials.stt_api_key) {
|
||||
/* get ibm access token */
|
||||
const {stt_api_key, stt_region} = this.sttCredentials;
|
||||
const {access_token, servedFromCache} = await getIbmAccessToken(stt_api_key);
|
||||
this.logger.debug({stt_api_key}, `Gather:exec - got ibm access token ${servedFromCache ? 'from cache' : ''}`);
|
||||
this.sttCredentials = {...this.sttCredentials, access_token, stt_region};
|
||||
}
|
||||
const startListening = (cs, ep) => {
|
||||
this._startTimer();
|
||||
if (this.isContinuousAsr && 0 === this.timeout) this._startAsrTimer();
|
||||
@@ -352,6 +354,16 @@ class TaskGather extends Task {
|
||||
this._onDeepGramConnectFailure.bind(this, cs, ep));
|
||||
break;
|
||||
|
||||
case 'ibm':
|
||||
this.bugname = 'ibm_transcribe';
|
||||
ep.addCustomEventListener(IbmTranscriptionEvents.Transcription, this._onTranscription.bind(this, cs, ep));
|
||||
ep.addCustomEventListener(IbmTranscriptionEvents.Connect, this._onIbmConnect.bind(this, cs, ep));
|
||||
ep.addCustomEventListener(IbmTranscriptionEvents.ConnectFailure,
|
||||
this._onIbmConnectFailure.bind(this, cs, ep));
|
||||
ep.addCustomEventListener(IbmTranscriptionEvents.Error,
|
||||
this._onIbmError.bind(this, cs, ep));
|
||||
break;
|
||||
|
||||
default:
|
||||
throw new Error(`Invalid vendor ${this.vendor}`);
|
||||
}
|
||||
@@ -579,6 +591,27 @@ class TaskGather extends Task {
|
||||
this.notifyTaskDone();
|
||||
}
|
||||
|
||||
_onIbmConnect(_cs, _ep) {
|
||||
this.logger.debug('TaskGather:_onIbmConnect');
|
||||
}
|
||||
|
||||
_onIbmConnectFailure(cs, _ep, evt) {
|
||||
const {reason} = evt;
|
||||
const {writeAlerts, AlertType} = cs.srf.locals;
|
||||
this.logger.info({evt}, 'TaskGather:_onIbmConnectFailure');
|
||||
writeAlerts({
|
||||
account_sid: cs.accountSid,
|
||||
alert_type: AlertType.STT_FAILURE,
|
||||
message: `Failed connecting to IBM watson speech recognizer: ${reason}`,
|
||||
vendor: 'ibm',
|
||||
}).catch((err) => this.logger.info({err}, 'Error generating alert for IBM connection failure'));
|
||||
this.notifyError(`Failed connecting to speech vendor IBM: ${reason}`);
|
||||
this.notifyTaskDone();
|
||||
}
|
||||
|
||||
_onIbmError(cs, _ep, evt) {
|
||||
this.logger.info({evt}, 'TaskGather:_onIbmError'); }
|
||||
|
||||
_onVadDetected(cs, ep) {
|
||||
if (this.bargein && this.minBargeinWordCount === 0) {
|
||||
this.logger.debug('TaskGather:_onVadDetected');
|
||||
|
||||
@@ -424,7 +424,7 @@
|
||||
"properties": {
|
||||
"vendor": {
|
||||
"type": "string",
|
||||
"enum": ["google", "aws", "polly", "microsoft", "default"]
|
||||
"enum": ["google", "aws", "polly", "microsoft", "nuance", "ibm", "default"]
|
||||
},
|
||||
"language": "string",
|
||||
"voice": "string",
|
||||
@@ -445,7 +445,7 @@
|
||||
"properties": {
|
||||
"vendor": {
|
||||
"type": "string",
|
||||
"enum": ["google", "aws", "microsoft", "nuance", "deepgram", "default"]
|
||||
"enum": ["google", "aws", "microsoft", "nuance", "deepgram", "ibm", "default"]
|
||||
},
|
||||
"language": "string",
|
||||
"vad": "#vad",
|
||||
@@ -511,12 +511,30 @@
|
||||
"asrDtmfTerminationDigit": "string",
|
||||
"asrTimeout": "number",
|
||||
"nuanceOptions": "#nuanceOptions",
|
||||
"deepgramOptions": "#deepgramOptions"
|
||||
"deepgramOptions": "#deepgramOptions",
|
||||
"ibmOptions": "#ibmOptions"
|
||||
},
|
||||
"required": [
|
||||
"vendor"
|
||||
]
|
||||
},
|
||||
"ibmOptions": {
|
||||
"properties": {
|
||||
"sttApiKey": "string",
|
||||
"sttRegion": "string",
|
||||
"ttsApiKey": "string",
|
||||
"ttsRegion": "string",
|
||||
"instanceId": "string",
|
||||
"model": "string",
|
||||
"languageCustomizationId": "string",
|
||||
"acousticCustomizationId": "string",
|
||||
"baseModelVersion": "string",
|
||||
"watsonMetadata": "string",
|
||||
"watsonLearningOptOut": "boolean"
|
||||
},
|
||||
"required": [
|
||||
]
|
||||
},
|
||||
"deepgramOptions": {
|
||||
"properties": {
|
||||
"apiKey": "string",
|
||||
|
||||
@@ -6,7 +6,8 @@ const {
|
||||
AzureTranscriptionEvents,
|
||||
AwsTranscriptionEvents,
|
||||
NuanceTranscriptionEvents,
|
||||
DeepgramTranscriptionEvents
|
||||
DeepgramTranscriptionEvents,
|
||||
IbmTranscriptionEvents
|
||||
} = require('../utils/constants');
|
||||
const normalizeJambones = require('../utils/normalize-jambones');
|
||||
|
||||
@@ -19,7 +20,8 @@ class TaskTranscribe extends Task {
|
||||
const {
|
||||
setChannelVarsForStt,
|
||||
normalizeTranscription,
|
||||
removeSpeechListeners
|
||||
removeSpeechListeners,
|
||||
setSpeechCredentialsAtRuntime
|
||||
} = require('../utils/transcription-utils')(logger);
|
||||
this.setChannelVarsForStt = setChannelVarsForStt;
|
||||
this.normalizeTranscription = normalizeTranscription;
|
||||
@@ -35,16 +37,7 @@ class TaskTranscribe extends Task {
|
||||
this.separateRecognitionPerChannel = recognizer.separateRecognitionPerChannel;
|
||||
|
||||
/* let credentials be supplied in the recognizer object at runtime */
|
||||
if (recognizer.vendor === 'nuance') {
|
||||
const {clientId, secret} = recognizer.nuanceOptions;
|
||||
if (clientId && secret) {
|
||||
this.sttCredentials = {client_id: clientId, secret};
|
||||
}
|
||||
}
|
||||
else if (recognizer.vendor === 'deepgram') {
|
||||
const {apiKey} = recognizer.deepgramOptions;
|
||||
if (apiKey) this.sttCredentials = {api_key: apiKey};
|
||||
}
|
||||
this.sttCredentials = setSpeechCredentialsAtRuntime(recognizer);
|
||||
|
||||
recognizer.hints = recognizer.hints || [];
|
||||
recognizer.altLanguages = recognizer.altLanguages || [];
|
||||
@@ -55,7 +48,7 @@ class TaskTranscribe extends Task {
|
||||
async exec(cs, {ep, ep2}) {
|
||||
super.exec(cs);
|
||||
const {updateSpeechCredentialLastUsed} = require('../utils/db-utils')(this.logger, cs.srf);
|
||||
const {getNuanceAccessToken} = cs.srf.locals.dbHelpers;
|
||||
const {getNuanceAccessToken, getIbmAccessToken} = cs.srf.locals.dbHelpers;
|
||||
|
||||
if (cs.hasGlobalSttHints) {
|
||||
const {hints, hintsBoost} = cs.globalSttHints;
|
||||
@@ -102,7 +95,13 @@ class TaskTranscribe extends Task {
|
||||
`Transcribe:exec - got nuance access token ${servedFromCache ? 'from cache' : ''}`);
|
||||
this.sttCredentials = {...this.sttCredentials, access_token};
|
||||
}
|
||||
|
||||
else if (this.vendor == 'ibm' && this.sttCredentials.stt_api_key) {
|
||||
/* get ibm access token */
|
||||
const {stt_api_key, stt_region} = this.sttCredentials;
|
||||
const {access_token, servedFromCache} = await getIbmAccessToken(stt_api_key);
|
||||
this.logger.debug({stt_api_key}, `Gather:exec - got ibm access token ${servedFromCache ? 'from cache' : ''}`);
|
||||
this.sttCredentials = {...this.sttCredentials, access_token, stt_region};
|
||||
}
|
||||
await this._startTranscribing(cs, ep, 1);
|
||||
if (this.separateRecognitionPerChannel && ep2) {
|
||||
await this._startTranscribing(cs, ep2, 2);
|
||||
@@ -189,6 +188,19 @@ class TaskTranscribe extends Task {
|
||||
ep.addCustomEventListener(DeepgramTranscriptionEvents.ConnectFailure,
|
||||
this._onDeepGramConnectFailure.bind(this, cs, ep, channel));
|
||||
break;
|
||||
|
||||
case 'ibm':
|
||||
this.bugname = 'ibm_transcribe';
|
||||
ep.addCustomEventListener(IbmTranscriptionEvents.Transcription,
|
||||
this._onTranscription.bind(this, cs, ep, channel));
|
||||
ep.addCustomEventListener(IbmTranscriptionEvents.Connect,
|
||||
this._onIbmConnect.bind(this, cs, ep, channel));
|
||||
ep.addCustomEventListener(IbmTranscriptionEvents.ConnectFailure,
|
||||
this._onIbmConnectFailure.bind(this, cs, ep, channel));
|
||||
ep.addCustomEventListener(IbmTranscriptionEvents.Error,
|
||||
this._onIbmError.bind(this, cs, ep, channel));
|
||||
break;
|
||||
|
||||
default:
|
||||
throw new Error(`Invalid vendor ${this.vendor}`);
|
||||
}
|
||||
@@ -279,7 +291,7 @@ class TaskTranscribe extends Task {
|
||||
this._timer = null;
|
||||
}
|
||||
}
|
||||
_onNuanceError(_cs, _ep, evt) {
|
||||
_onNuanceError(_cs, _ep, _channel, evt) {
|
||||
const {code, error, details} = evt;
|
||||
if (code === 404 && error === 'No speech') {
|
||||
this.logger.debug({code, error, details}, 'TaskTranscribe:_onNuanceError');
|
||||
@@ -294,7 +306,7 @@ class TaskTranscribe extends Task {
|
||||
this.logger.debug('TaskTranscribe:_onDeepgramConnect');
|
||||
}
|
||||
|
||||
_onDeepGramConnectFailure(cs, _ep, evt) {
|
||||
_onDeepGramConnectFailure(cs, _ep, _channel, evt) {
|
||||
const {reason} = evt;
|
||||
const {writeAlerts, AlertType} = cs.srf.locals;
|
||||
this.logger.info({evt}, 'TaskTranscribe:_onDeepgramConnectFailure');
|
||||
@@ -307,6 +319,29 @@ class TaskTranscribe extends Task {
|
||||
this.notifyError(`Failed connecting to speech vendor deepgram: ${reason}`);
|
||||
this.notifyTaskDone();
|
||||
}
|
||||
|
||||
_onIbmConnect(_cs, _ep) {
|
||||
this.logger.debug('TaskTranscribe:_onIbmConnect');
|
||||
}
|
||||
|
||||
_onIbmConnectFailure(cs, _ep, _channel, evt) {
|
||||
const {reason} = evt;
|
||||
const {writeAlerts, AlertType} = cs.srf.locals;
|
||||
this.logger.info({evt}, 'TaskTranscribe:_onIbmConnectFailure');
|
||||
writeAlerts({
|
||||
account_sid: cs.accountSid,
|
||||
alert_type: AlertType.STT_FAILURE,
|
||||
message: `Failed connecting to IBM watson speech recognizer: ${reason}`,
|
||||
vendor: 'ibm',
|
||||
}).catch((err) => this.logger.info({err}, 'Error generating alert for IBM connection failure'));
|
||||
this.notifyError(`Failed connecting to speech vendor IBM: ${reason}`);
|
||||
this.notifyTaskDone();
|
||||
}
|
||||
_onIbmError(cs, _ep, _channel, evt) {
|
||||
this.logger.info({evt}, 'TaskGather:_onIbmError');
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
module.exports = TaskTranscribe;
|
||||
|
||||
@@ -79,6 +79,12 @@
|
||||
"ConnectFailure": "deepgram_transcribe::connect_failed",
|
||||
"Connect": "deepgram_transcribe::connect"
|
||||
},
|
||||
"IbmTranscriptionEvents": {
|
||||
"Transcription": "ibm_transcribe::transcription",
|
||||
"ConnectFailure": "ibm_transcribe::connect_failed",
|
||||
"Connect": "ibm_transcribe::connect",
|
||||
"Error": "ibm_transcribe::error"
|
||||
},
|
||||
"AwsTranscriptionEvents": {
|
||||
"Transcription": "aws_transcribe::transcription",
|
||||
"EndOfTranscript": "aws_transcribe::end_of_transcript",
|
||||
|
||||
@@ -153,7 +153,8 @@ function installSrfLocals(srf, logger) {
|
||||
removeFromList,
|
||||
lengthOfList,
|
||||
getListPosition,
|
||||
getNuanceAccessToken
|
||||
getNuanceAccessToken,
|
||||
getIbmAccessToken,
|
||||
} = require('@jambonz/realtimedb-helpers')({
|
||||
host: process.env.JAMBONES_REDIS_HOST,
|
||||
port: process.env.JAMBONES_REDIS_PORT || 6379
|
||||
@@ -206,7 +207,8 @@ function installSrfLocals(srf, logger) {
|
||||
removeFromList,
|
||||
lengthOfList,
|
||||
getListPosition,
|
||||
getNuanceAccessToken
|
||||
getNuanceAccessToken,
|
||||
getIbmAccessToken
|
||||
},
|
||||
parentLogger: logger,
|
||||
getSBC,
|
||||
|
||||
@@ -27,6 +27,23 @@ const normalizeDeepgram = (evt, channel, language) => {
|
||||
};
|
||||
};
|
||||
|
||||
/**
 * Convert an IBM Watson transcription event into the vendor-neutral shape.
 *
 * Only the first entry of `evt.results` is used. If the event carries no
 * results (missing or empty array), a non-final result with no alternatives
 * is returned instead of throwing.
 *
 * @param {object} evt - raw event; expected to carry `results[0].final` and `results[0].alternatives`
 * @param {number} channel - value copied to `channel_tag`
 * @param {string} language - value copied to `language_code`
 * @returns {object} normalized transcription; a deep copy of the raw event is kept under `vendor.evt`
 */
const normalizeIbm = (evt, channel, language) => {
  const copy = JSON.parse(JSON.stringify(evt));
  const result = (Array.isArray(evt.results) && evt.results[0]) || {};

  return {
    language_code: language,
    channel_tag: channel,
    is_final: result.final || false,
    alternatives: result.alternatives || [],
    vendor: {
      name: 'ibm',
      evt: copy
    }
  };
};
|
||||
|
||||
const normalizeGoogle = (evt, channel, language) => {
|
||||
const copy = JSON.parse(JSON.stringify(evt));
|
||||
return {
|
||||
@@ -113,6 +130,8 @@ module.exports = (logger) => {
|
||||
return normalizeAws(evt, channel, language);
|
||||
case 'nuance':
|
||||
return normalizeNuance(evt, channel, language);
|
||||
case 'ibm':
|
||||
return normalizeIbm(evt, channel, language);
|
||||
default:
|
||||
logger.error(`Unknown vendor ${vendor}`);
|
||||
return evt;
|
||||
@@ -311,6 +330,31 @@ module.exports = (logger) => {
|
||||
{DEEPGRAM_SPEECH_VAD_TURNOFF: deepgramOptions.tag}
|
||||
};
|
||||
}
|
||||
else if ('ibm' === rOpts.vendor) {
|
||||
const {ibmOptions = {}} = rOpts;
|
||||
opts = {
|
||||
...opts,
|
||||
...(sttCredentials.access_token) &&
|
||||
{IBM_ACCESS_TOKEN: sttCredentials.access_token},
|
||||
...(sttCredentials.stt_region) &&
|
||||
{IBM_SPEECH_REGION: sttCredentials.stt_region},
|
||||
...(sttCredentials.instance_id) &&
|
||||
{IBM_SPEECH_INSTANCE_ID: sttCredentials.instance_id},
|
||||
...(ibmOptions.model) &&
|
||||
{IBM_SPEECH_MODEL: ibmOptions.model},
|
||||
...(ibmOptions.language_customization_id) &&
|
||||
{IBM_SPEECH_LANGUAGE_CUSTOMIZATION_ID: ibmOptions.language_customization_id},
|
||||
...(ibmOptions.acoustic_customization_id) &&
|
||||
{IBM_SPEECH_ACOUSTIC_CUSTOMIZATION_ID: ibmOptions.acoustic_customization_id},
|
||||
...(ibmOptions.baseModelVersion) &&
|
||||
{IBM_SPEECH_BASE_MODEL_VERSION: ibmOptions.baseModelVersion},
|
||||
...(ibmOptions.watsonMetadata) &&
|
||||
{IBM_SPEECH_WATSON_METADATA: ibmOptions.watsonMetadata},
|
||||
...(ibmOptions.watsonLearningOptOut) &&
|
||||
{IBM_SPEECH_WATSON_LEARNING_OPT_OUT: ibmOptions.watsonLearningOptOut}
|
||||
};
|
||||
}
|
||||
|
||||
logger.debug({opts}, 'recognizer channel vars');
|
||||
return opts;
|
||||
};
|
||||
@@ -336,12 +380,34 @@ module.exports = (logger) => {
|
||||
ep.removeCustomEventListener(DeepgramTranscriptionEvents.Transcription);
|
||||
ep.removeCustomEventListener(DeepgramTranscriptionEvents.Connect);
|
||||
ep.removeCustomEventListener(DeepgramTranscriptionEvents.ConnectFailure);
|
||||
|
||||
|
||||
};
|
||||
|
||||
/**
 * Extract speech credentials supplied inline in a recognizer object.
 *
 * The vendor-specific option objects (nuanceOptions, deepgramOptions,
 * ibmOptions) are optional, so each is defaulted to an empty object before
 * destructuring; a recognizer without them yields `undefined` instead of a
 * TypeError.
 *
 * @param {object} [recognizer] - recognizer settings; `vendor` selects which options are read
 * @returns {object|undefined} credentials for the vendor, or undefined when
 *   none were supplied (or the vendor takes no runtime credentials here)
 */
const setSpeechCredentialsAtRuntime = (recognizer) => {
  if (!recognizer) return;
  const {vendor} = recognizer;

  if (vendor === 'nuance') {
    const {clientId, secret} = recognizer.nuanceOptions || {};
    if (clientId && secret) return {client_id: clientId, secret};
  }
  else if (vendor === 'deepgram') {
    const {apiKey} = recognizer.deepgramOptions || {};
    if (apiKey) return {api_key: apiKey};
  }
  else if (vendor === 'ibm') {
    const {ttsApiKey, ttsRegion, sttApiKey, sttRegion, instanceId} = recognizer.ibmOptions || {};
    /* either key is enough; the remaining fields are passed through even if undefined */
    if (ttsApiKey || sttApiKey) return {
      tts_api_key: ttsApiKey,
      tts_region: ttsRegion,
      stt_api_key: sttApiKey,
      stt_region: sttRegion,
      instance_id: instanceId
    };
  }
};
|
||||
|
||||
return {
|
||||
normalizeTranscription,
|
||||
setChannelVarsForStt,
|
||||
removeSpeechListeners
|
||||
removeSpeechListeners,
|
||||
setSpeechCredentialsAtRuntime
|
||||
};
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user