Files
jambonz-feature-server/lib/utils/transcription-utils.js
Dave Horton 509bb065bb Feature/nuance stt (#185)
* initial changes to gather to support nuance stt

* updateSpeechCredentialLastUsed could be called without a speech_credential_sid if credentials are passed in the flow

* fix bugname

* typo

* added handlers for nuance

* logging

* major refactor of parsing transcriptions

* initial support for nuance in transcribe verb

* updates from testing

* cleanup some tests

* update action

* typo

* gather: start nuance timers after say/play completes

* update drachtio-fsrmf

* refactor some code

* typo

* log nuance error detail

* timeout handling

* typo

* handle nuance 413 response when recognition times out

* typo in specs.json

* add support for nuance resources

* fixes and tests for transcribe

* remove logging from test

* initial support for kryptonEndpoint

* try getting access token even when using krypton

* typo in kryptonEndpoint property

* add support for Nuance tts

* parse nuance voice and model for tts

* use nuance credentials from db

* update to db-helpers@0.7.0 with caching option

* add support for azure audio logging in gather/transcribe

* sync package-lock.json
2022-11-01 12:23:49 -04:00

233 lines
10 KiB
JavaScript

const {
TaskName,
AzureTranscriptionEvents,
GoogleTranscriptionEvents,
AwsTranscriptionEvents,
NuanceTranscriptionEvents
} = require('./constants');
module.exports = (logger) => {
const normalizeTranscription = (evt, vendor, channel, language) => {
let newEvent = JSON.parse(JSON.stringify(evt));
/* add in channel_tag and provide the full vendor-specific event */
newEvent = {
...(vendor === 'aws' ? newEvent[0] : newEvent),
language_code: language,
channel_tag: channel
};
if ('aws' === vendor && Array.isArray(evt) && evt.length > 0) {
newEvent = {
...newEvent,
vendor: {event: evt, name: vendor}
};
}
else if ('microsoft' === vendor) {
const nbest = evt.NBest;
const language_code = evt.PrimaryLanguage?.Language || language;
const alternatives = nbest ? nbest.map((n) => {
return {
confidence: n.Confidence,
transcript: n.Display
};
}) :
[
{
transcript: evt.DisplayText || evt.Text
}
];
newEvent = {
...newEvent,
is_final: evt.RecognitionStatus === 'Success',
channel,
language_code,
alternatives,
vendor: {event: evt, name: vendor}
};
}
return newEvent;
};
const setChannelVarsForStt = (task, sttCredentials, rOpts = {}) => {
let opts = {};
const {enable, voiceMs = 0, mode = -1} = rOpts.vad || {};
const vad = {enable, voiceMs, mode};
/* voice activity detection works across vendors */
opts = {
...opts,
...(vad.enable && {START_RECOGNIZING_ON_VAD: 1}),
...(vad.enable && vad.voiceMs && {RECOGNIZER_VAD_VOICE_MS: vad.voiceMs}),
...(vad.enable && typeof vad.mode === 'number' && {RECOGNIZER_VAD_MODE: vad.mode}),
};
if ('google' === rOpts.vendor) {
opts = {
...opts,
...(sttCredentials &&
{GOOGLE_APPLICATION_CREDENTIALS: JSON.stringify(sttCredentials.credentials)}),
...(rOpts.enhancedModel &&
{GOOGLE_SPEECH_USE_ENHANCED: 1}),
...(rOpts.separateRecognitionPerChannel &&
{GOOGLE_SPEECH_SEPARATE_RECOGNITION_PER_CHANNEL: 1}),
...(rOpts.profanityFilter &&
{GOOGLE_SPEECH_PROFANITY_FILTER: 1}),
...(rOpts.punctuation &&
{GOOGLE_SPEECH_ENABLE_AUTOMATIC_PUNCTUATION: 1}),
...(rOpts.words &&
{GOOGLE_SPEECH_ENABLE_WORD_TIME_OFFSETS: 1}),
...((rOpts.singleUtterance || task.name === TaskName.Gather) &&
{GOOGLE_SPEECH_SINGLE_UTTERANCE: 1}),
...(rOpts.diarization &&
{GOOGLE_SPEECH_SPEAKER_DIARIZATION: 1}),
...(rOpts.diarization && rOpts.diarizationMinSpeakers > 0 &&
{GOOGLE_SPEECH_SPEAKER_DIARIZATION_MIN_SPEAKER_COUNT: rOpts.diarizationMinSpeakers}),
...(rOpts.diarization && rOpts.diarizationMaxSpeakers > 0 &&
{GOOGLE_SPEECH_SPEAKER_DIARIZATION_MAX_SPEAKER_COUNT: rOpts.diarizationMaxSpeakers}),
...(rOpts.enhancedModel === false &&
{GOOGLE_SPEECH_USE_ENHANCED: 0}),
...(rOpts.separateRecognitionPerChannel === false &&
{GOOGLE_SPEECH_SEPARATE_RECOGNITION_PER_CHANNEL: 0}),
...(rOpts.profanityFilter === false &&
{GOOGLE_SPEECH_PROFANITY_FILTER: 0}),
...(rOpts.punctuation === false &&
{GOOGLE_SPEECH_ENABLE_AUTOMATIC_PUNCTUATION: 0}),
...(rOpts.words == false &&
{GOOGLE_SPEECH_ENABLE_WORD_TIME_OFFSETS: 0}),
...((rOpts.singleUtterance === false || task.name === TaskName.Transcribe) &&
{GOOGLE_SPEECH_SINGLE_UTTERANCE: 0}),
...(rOpts.diarization === false &&
{GOOGLE_SPEECH_SPEAKER_DIARIZATION: 0}),
...(rOpts.hints.length > 0 &&
{GOOGLE_SPEECH_HINTS: rOpts.hints.join(',')}),
...(typeof rOpts.hintsBoost === 'number' &&
{GOOGLE_SPEECH_HINTS_BOOST: rOpts.hintsBoost}),
...(rOpts.altLanguages.length > 0 &&
{GOOGLE_SPEECH_ALTERNATIVE_LANGUAGE_CODES: rOpts.altLanguages.join(',')}),
...(rOpts.interactionType &&
{GOOGLE_SPEECH_METADATA_INTERACTION_TYPE: rOpts.interactionType}),
...{GOOGLE_SPEECH_MODEL: rOpts.model || (task.name === TaskName.Gather ? 'command_and_search' : 'phone_call')},
...(rOpts.naicsCode > 0 &&
{GOOGLE_SPEECH_METADATA_INDUSTRY_NAICS_CODE: rOpts.naicsCode}),
};
}
else if (['aws', 'polly'].includes(rOpts.vendor)) {
opts = {
...opts,
...(rOpts.vocabularyName && {AWS_VOCABULARY_NAME: rOpts.vocabularyName}),
...(rOpts.vocabularyFilterName && {AWS_VOCABULARY_FILTER_NAME: rOpts.vocabularyFilterName}),
...(rOpts.filterMethod && {AWS_VOCABULARY_FILTER_METHOD: rOpts.filterMethod}),
...(sttCredentials && {
AWS_ACCESS_KEY_ID: sttCredentials.accessKeyId,
AWS_SECRET_ACCESS_KEY: sttCredentials.secretAccessKey,
AWS_REGION: sttCredentials.region
}),
};
}
else if ('microsoft' === rOpts.vendor) {
opts = {
...opts,
...(rOpts.hints && rOpts.hints.length > 0 &&
{AZURE_SPEECH_HINTS: rOpts.hints.map((h) => h.trim()).join(',')}),
...(rOpts.altLanguages && rOpts.altLanguages.length > 0 &&
{AZURE_SERVICE_ENDPOINT_ID: rOpts.sttCredentials}),
...(rOpts.requestSnr && {AZURE_REQUEST_SNR: 1}),
...(rOpts.profanityOption && {AZURE_PROFANITY_OPTION: rOpts.profanityOption}),
...(rOpts.azureServiceEndpoint && {AZURE_SERVICE_ENDPOINT: rOpts.azureServiceEndpoint}),
...(rOpts.initialSpeechTimeoutMs > 0 &&
{AZURE_INITIAL_SPEECH_TIMEOUT_MS: rOpts.initialSpeechTimeoutMs}),
...(rOpts.requestSnr && {AZURE_REQUEST_SNR: 1}),
...(rOpts.audioLogging && {AZURE_AUDIO_LOGGING: 1}),
...{AZURE_USE_OUTPUT_FORMAT_DETAILED: 1},
...(sttCredentials && {
AZURE_SUBSCRIPTION_KEY: sttCredentials.api_key,
AZURE_REGION: sttCredentials.region,
}),
...(sttCredentials.use_custom_stt && sttCredentials.custom_stt_endpoint &&
{AZURE_SERVICE_ENDPOINT_ID: sttCredentials.custom_stt_endpoint})
};
}
else if ('nuance' === rOpts.vendor) {
/**
* Note: all nuance options are in recognizer.nuanceOptions, should migrate
* other vendor settings to similar nested structure
*/
const {nuanceOptions = {}} = rOpts;
opts = {
...opts,
...(sttCredentials.access_token) &&
{NUANCE_ACCESS_TOKEN: sttCredentials.access_token},
...(sttCredentials.krypton_endpoint) &&
{NUANCE_KRYPTON_ENDPOINT: sttCredentials.krypton_endpoint},
...(nuanceOptions.topic) &&
{NUANCE_TOPIC: nuanceOptions.topic},
...(nuanceOptions.utteranceDetectionMode) &&
{NUANCE_UTTERANCE_DETECTION_MODE: nuanceOptions.utteranceDetectionMode},
...(nuanceOptions.punctuation) && {NUANCE_PUNCTUATION: nuanceOptions.punctuation},
...(nuanceOptions.profanityFilter) &&
{NUANCE_FILTER_PROFANITY: nuanceOptions.profanityFilter},
...(nuanceOptions.includeTokenization) &&
{NUANCE_INCLUDE_TOKENIZATION: nuanceOptions.includeTokenization},
...(nuanceOptions.discardSpeakerAdaptation) &&
{NUANCE_DISCARD_SPEAKER_ADAPTATION: nuanceOptions.discardSpeakerAdaptation},
...(nuanceOptions.suppressCallRecording) &&
{NUANCE_SUPPRESS_CALL_RECORDING: nuanceOptions.suppressCallRecording},
...(nuanceOptions.maskLoadFailures) &&
{NUANCE_MASK_LOAD_FAILURES: nuanceOptions.maskLoadFailures},
...(nuanceOptions.suppressInitialCapitalization) &&
{NUANCE_SUPPRESS_INITIAL_CAPITALIZATION: nuanceOptions.suppressInitialCapitalization},
...(nuanceOptions.allowZeroBaseLmWeight)
&& {NUANCE_ALLOW_ZERO_BASE_LM_WEIGHT: nuanceOptions.allowZeroBaseLmWeight},
...(nuanceOptions.filterWakeupWord) &&
{NUANCE_FILTER_WAKEUP_WORD: nuanceOptions.filterWakeupWord},
...(nuanceOptions.resultType) &&
{NUANCE_RESULT_TYPE: nuanceOptions.resultType || rOpts.interim ? 'partial' : 'final'},
...(nuanceOptions.noInputTimeoutMs) &&
{NUANCE_NO_INPUT_TIMEOUT_MS: nuanceOptions.noInputTimeoutMs},
...(nuanceOptions.recognitionTimeoutMs) &&
{NUANCE_RECOGNITION_TIMEOUT_MS: nuanceOptions.recognitionTimeoutMs},
...(nuanceOptions.utteranceEndSilenceMs) &&
{NUANCE_UTTERANCE_END_SILENCE_MS: nuanceOptions.utteranceEndSilenceMs},
...(nuanceOptions.maxHypotheses) &&
{NUANCE_MAX_HYPOTHESES: nuanceOptions.maxHypotheses},
...(nuanceOptions.speechDomain) &&
{NUANCE_SPEECH_DOMAIN: nuanceOptions.speechDomain},
...(nuanceOptions.formatting) &&
{NUANCE_FORMATTING: nuanceOptions.formatting},
...(nuanceOptions.resources) &&
{NUANCE_RESOURCES: JSON.stringify(nuanceOptions.resources)},
};
}
logger.debug({opts}, 'recognizer channel vars');
return opts;
};
const removeSpeechListeners = (ep) => {
ep.removeCustomEventListener(GoogleTranscriptionEvents.Transcription);
ep.removeCustomEventListener(GoogleTranscriptionEvents.EndOfUtterance);
ep.removeCustomEventListener(GoogleTranscriptionEvents.VadDetected);
ep.removeCustomEventListener(AwsTranscriptionEvents.Transcription);
ep.removeCustomEventListener(AwsTranscriptionEvents.VadDetected);
ep.removeCustomEventListener(AzureTranscriptionEvents.Transcription);
ep.removeCustomEventListener(AzureTranscriptionEvents.NoSpeechDetected);
ep.removeCustomEventListener(AzureTranscriptionEvents.VadDetected);
ep.removeCustomEventListener(NuanceTranscriptionEvents.Transcription);
ep.removeCustomEventListener(NuanceTranscriptionEvents.TranscriptionComplete);
ep.removeCustomEventListener(NuanceTranscriptionEvents.StartOfSpeech);
ep.removeCustomEventListener(NuanceTranscriptionEvents.Error);
ep.removeCustomEventListener(NuanceTranscriptionEvents.VadDetected);
};
return {
normalizeTranscription,
setChannelVarsForStt,
removeSpeechListeners
};
};