mirror of
https://github.com/jambonz/jambonz-feature-server.git
synced 2025-12-20 16:50:39 +00:00
feat: rest:dial amd (#339)
Add support for sending 'amd' property in createCall REST API and also added support for using any of the speech vendors for STT --------- Co-authored-by: Dave Horton <daveh@beachdognet.com>
This commit is contained in:
@@ -1,9 +1,16 @@
|
||||
const Emitter = require('events');
|
||||
const {readFile} = require('fs');
|
||||
const {
|
||||
TaskName,
|
||||
GoogleTranscriptionEvents,
|
||||
AwsTranscriptionEvents,
|
||||
AzureTranscriptionEvents,
|
||||
NuanceTranscriptionEvents,
|
||||
NvidiaTranscriptionEvents,
|
||||
IbmTranscriptionEvents,
|
||||
SonioxTranscriptionEvents,
|
||||
DeepgramTranscriptionEvents,
|
||||
JambonzTranscriptionEvents,
|
||||
AmdEvents,
|
||||
AvmdEvents
|
||||
} = require('./constants');
|
||||
@@ -54,6 +61,11 @@ class Amd extends Emitter {
|
||||
this.thresholdWordCount = opts.thresholdWordCount || 9;
|
||||
const {normalizeTranscription} = require('./transcription-utils')(logger);
|
||||
this.normalizeTranscription = normalizeTranscription;
|
||||
const {getNuanceAccessToken, getIbmAccessToken} = cs.srf.locals.dbHelpers;
|
||||
this.getNuanceAccessToken = getNuanceAccessToken;
|
||||
this.getIbmAccessToken = getIbmAccessToken;
|
||||
const {setChannelVarsForStt} = require('./transcription-utils')(logger);
|
||||
this.setChannelVarsForStt = setChannelVarsForStt;
|
||||
|
||||
const {
|
||||
noSpeechTimeoutMs = 5000,
|
||||
@@ -229,51 +241,92 @@ module.exports = (logger) => {
|
||||
|
||||
const startAmd = async(cs, ep, task, opts) => {
|
||||
const amd = ep.amd = new Amd(logger, cs, opts);
|
||||
const {vendor, language, sttCredentials} = amd;
|
||||
const sttOpts = {};
|
||||
const {vendor, language} = amd;
|
||||
let sttCredentials = amd.sttCredentials;
|
||||
const hints = voicemailHints[language] || [];
|
||||
|
||||
if (vendor === 'nuance' && sttCredentials.client_id) {
|
||||
/* get nuance access token */
|
||||
const {getNuanceAccessToken} = amd;
|
||||
const {client_id, secret} = sttCredentials;
|
||||
const {access_token, servedFromCache} = await getNuanceAccessToken(client_id, secret, 'asr tts');
|
||||
logger.debug({client_id}, `Gather:exec - got nuance access token ${servedFromCache ? 'from cache' : ''}`);
|
||||
sttCredentials = {...sttCredentials, access_token};
|
||||
}
|
||||
else if (vendor == 'ibm' && sttCredentials.stt_api_key) {
|
||||
/* get ibm access token */
|
||||
const {getIbmAccessToken} = amd;
|
||||
const {stt_api_key, stt_region} = sttCredentials;
|
||||
const {access_token, servedFromCache} = await getIbmAccessToken(stt_api_key);
|
||||
logger.debug({stt_api_key}, `Gather:exec - got ibm access token ${servedFromCache ? 'from cache' : ''}`);
|
||||
sttCredentials = {...sttCredentials, access_token, stt_region};
|
||||
}
|
||||
|
||||
/* set stt options */
|
||||
logger.info(`starting amd for vendor ${vendor} and language ${language}`);
|
||||
if ('google' === vendor) {
|
||||
sttOpts.GOOGLE_APPLICATION_CREDENTIALS = JSON.stringify(sttCredentials.credentials);
|
||||
sttOpts.GOOGLE_SPEECH_USE_ENHANCED = true;
|
||||
sttOpts.GOOGLE_SPEECH_HINTS = hints.join(',');
|
||||
if (opts.recognizer?.altLanguages) {
|
||||
sttOpts.GOOGLE_SPEECH_ALTERNATIVE_LANGUAGE_CODES = opts.recognizer.altLanguages.join(',');
|
||||
}
|
||||
ep.addCustomEventListener(GoogleTranscriptionEvents.Transcription, onTranscription.bind(null, cs, ep, task));
|
||||
ep.addCustomEventListener(GoogleTranscriptionEvents.EndOfUtterance, onEndOfUtterance.bind(null, cs, ep, task));
|
||||
}
|
||||
else if (['aws', 'polly'].includes(vendor)) {
|
||||
Object.assign(sttOpts, {
|
||||
AWS_ACCESS_KEY_ID: sttCredentials.accessKeyId,
|
||||
AWS_SECRET_ACCESS_KEY: sttCredentials.secretAccessKey,
|
||||
AWS_REGION: sttCredentials.region
|
||||
});
|
||||
ep.addCustomEventListener(AwsTranscriptionEvents.Transcription, onTranscription.bind(null, cs, ep, task));
|
||||
}
|
||||
else if ('microsoft' === vendor) {
|
||||
Object.assign(sttOpts, {
|
||||
'AZURE_SUBSCRIPTION_KEY': sttCredentials.api_key,
|
||||
'AZURE_REGION': sttCredentials.region
|
||||
});
|
||||
sttOpts.AZURE_SPEECH_HINTS = hints.join(',');
|
||||
if (opts.recognizer?.altLanguages) {
|
||||
sttOpts.AZURE_SPEECH_ALTERNATIVE_LANGUAGE_CODES = opts.recognizer.altLanguages.join(',');
|
||||
}
|
||||
sttOpts.AZURE_INITIAL_SPEECH_TIMEOUT_MS = opts.resolveTimeoutMs || 20000;
|
||||
const sttOpts = amd.setChannelVarsForStt({name: TaskName.Gather}, sttCredentials, {
|
||||
vendor,
|
||||
hints,
|
||||
enhancedModel: true,
|
||||
altLanguages: opts.recognizer?.altLanguages || [],
|
||||
initialSpeechTimeoutMs: opts.resolveTimeoutMs,
|
||||
});
|
||||
|
||||
ep.addCustomEventListener(AzureTranscriptionEvents.Transcription, onTranscription.bind(null, cs, ep, task));
|
||||
ep.addCustomEventListener(AzureTranscriptionEvents.NoSpeechDetected, onNoSpeechDetected.bind(null, cs, ep, task));
|
||||
}
|
||||
await ep.set(sttOpts).catch((err) => logger.info(err, 'Error setting channel variables'));
|
||||
|
||||
amd.transcriptionHandler = onTranscription.bind(null, cs, ep, task);
|
||||
amd.EndOfUtteranceHandler = onEndOfUtterance.bind(null, cs, ep, task);
|
||||
amd.noSpeechHandler = onNoSpeechDetected.bind(null, cs, ep, task);
|
||||
|
||||
switch (vendor) {
|
||||
case 'google':
|
||||
ep.addCustomEventListener(GoogleTranscriptionEvents.Transcription, amd.transcriptionHandler);
|
||||
ep.addCustomEventListener(GoogleTranscriptionEvents.EndOfUtterance, amd.EndOfUtteranceHandler);
|
||||
break;
|
||||
|
||||
case 'aws':
|
||||
case 'polly':
|
||||
ep.addCustomEventListener(AwsTranscriptionEvents.Transcription, amd.transcriptionHandler);
|
||||
break;
|
||||
case 'microsoft':
|
||||
ep.addCustomEventListener(AzureTranscriptionEvents.Transcription, amd.transcriptionHandler);
|
||||
ep.addCustomEventListener(AzureTranscriptionEvents.NoSpeechDetected, amd.noSpeechHandler);
|
||||
break;
|
||||
case 'nuance':
|
||||
ep.addCustomEventListener(NuanceTranscriptionEvents.Transcription, amd.transcriptionHandler);
|
||||
break;
|
||||
|
||||
case 'deepgram':
|
||||
ep.addCustomEventListener(DeepgramTranscriptionEvents.Transcription, amd.transcriptionHandler);
|
||||
break;
|
||||
|
||||
case 'soniox':
|
||||
amd.bugname = 'soniox_amd_transcribe';
|
||||
ep.addCustomEventListener(SonioxTranscriptionEvents.Transcription, amd.transcriptionHandler);
|
||||
break;
|
||||
|
||||
case 'ibm':
|
||||
ep.addCustomEventListener(IbmTranscriptionEvents.Transcription, amd.transcriptionHandler);
|
||||
break;
|
||||
|
||||
case 'nvidia':
|
||||
ep.addCustomEventListener(NvidiaTranscriptionEvents.Transcription, amd.transcriptionHandler);
|
||||
break;
|
||||
|
||||
default:
|
||||
if (vendor.startsWith('custom:')) {
|
||||
ep.addCustomEventListener(JambonzTranscriptionEvents.Transcription, amd.transcriptionHandler);
|
||||
break;
|
||||
}
|
||||
else {
|
||||
throw new Error(`Invalid vendor ${this.vendor}`);
|
||||
}
|
||||
}
|
||||
amd
|
||||
.on(AmdEvents.NoSpeechDetected, (evt) => {
|
||||
task.emit('amd', {type: AmdEvents.NoSpeechDetected, ...evt});
|
||||
try {
|
||||
ep.connected && ep.stopTranscription({vendor, bugname});
|
||||
stopAmd(ep, task);
|
||||
} catch (err) {
|
||||
logger.info({err}, 'Error stopping transcription');
|
||||
}
|
||||
@@ -281,7 +334,7 @@ module.exports = (logger) => {
|
||||
.on(AmdEvents.HumanDetected, (evt) => {
|
||||
task.emit('amd', {type: AmdEvents.HumanDetected, ...evt});
|
||||
try {
|
||||
ep.connected && ep.stopTranscription({vendor, bugname});
|
||||
stopAmd(ep, task);
|
||||
} catch (err) {
|
||||
logger.info({err}, 'Error stopping transcription');
|
||||
}
|
||||
@@ -292,7 +345,7 @@ module.exports = (logger) => {
|
||||
.on(AmdEvents.DecisionTimeout, (evt) => {
|
||||
task.emit('amd', {type: AmdEvents.DecisionTimeout, ...evt});
|
||||
try {
|
||||
ep.connected && ep.stopTranscription({vendor, bugname});
|
||||
stopAmd(ep, task);
|
||||
} catch (err) {
|
||||
logger.info({err}, 'Error stopping transcription');
|
||||
}
|
||||
@@ -300,7 +353,7 @@ module.exports = (logger) => {
|
||||
.on(AmdEvents.ToneTimeout, (evt) => {
|
||||
//task.emit('amd', {type: AmdEvents.ToneTimeout, ...evt});
|
||||
try {
|
||||
ep.connected && ep.execute('avmd_stop').catch((err) => logger.info(err, 'Error stopping avmd'));
|
||||
stopAmd(ep, task);
|
||||
} catch (err) {
|
||||
logger.info({err}, 'Error stopping avmd');
|
||||
}
|
||||
@@ -308,7 +361,7 @@ module.exports = (logger) => {
|
||||
.on(AmdEvents.MachineStoppedSpeaking, () => {
|
||||
task.emit('amd', {type: AmdEvents.MachineStoppedSpeaking});
|
||||
try {
|
||||
ep.connected && ep.stopTranscription({vendor, bugname});
|
||||
stopAmd(ep, task);
|
||||
} catch (err) {
|
||||
logger.info({err}, 'Error stopping transcription');
|
||||
}
|
||||
@@ -327,6 +380,19 @@ module.exports = (logger) => {
|
||||
if (ep.amd) {
|
||||
vendor = ep.amd.vendor;
|
||||
ep.amd.stopAllTimers();
|
||||
|
||||
ep.removeListener(GoogleTranscriptionEvents.Transcription, ep.amd.transcriptionHandler);
|
||||
ep.removeListener(GoogleTranscriptionEvents.EndOfUtterance, ep.amd.EndOfUtteranceHandler);
|
||||
ep.removeListener(AwsTranscriptionEvents.Transcription, ep.amd.transcriptionHandler);
|
||||
ep.removeListener(AzureTranscriptionEvents.Transcription, ep.amd.transcriptionHandler);
|
||||
ep.removeListener(AzureTranscriptionEvents.NoSpeechDetected, ep.amd.noSpeechHandler);
|
||||
ep.removeListener(NuanceTranscriptionEvents.Transcription, ep.amd.transcriptionHandler);
|
||||
ep.removeListener(DeepgramTranscriptionEvents.Transcription, ep.amd.transcriptionHandler);
|
||||
ep.removeListener(SonioxTranscriptionEvents.Transcription, ep.amd.transcriptionHandler);
|
||||
ep.removeListener(IbmTranscriptionEvents.Transcription, ep.amd.transcriptionHandler);
|
||||
ep.removeListener(NvidiaTranscriptionEvents.Transcription, ep.amd.transcriptionHandler);
|
||||
ep.removeListener(JambonzTranscriptionEvents.Transcription, ep.amd.transcriptionHandler);
|
||||
|
||||
ep.amd = null;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user