Feature/deepgram stt (#190)

* initial changes to support deepgram stt

* fixes for normalizing vendor-specific transcriptions

* update to latest drachtio-fsmrf with support for deepgram stt

* deepgram parsing error

* hints support for deepgram

* handling deepgram errors

* ignore late arriving transcripts for deepgram

* handling of empty transcripts

* transcribe changes

* allow deepgram stt credentials to be provided at run time

* bind channel in transcription handler

* fixes for transcribe when handling empty transcripts

* more empty transcript fixes

* update tests to latest modules

* add test cases for deepgram speech recognition
This commit is contained in:
Dave Horton
2022-11-12 19:48:59 -05:00
committed by GitHub
parent f511e6ab6b
commit 8686348454
12 changed files with 2148 additions and 152 deletions

View File

@@ -74,6 +74,11 @@
"Error": "nuance_transcribe::error",
"VadDetected": "nuance_transcribe::vad_detected"
},
"DeepgramTranscriptionEvents": {
"Transcription": "deepgram_transcribe::transcription",
"ConnectFailure": "deepgram_transcribe::connect_failed",
"Connect": "deepgram_transcribe::connect"
},
"AwsTranscriptionEvents": {
"Transcription": "aws_transcribe::transcription",
"EndOfTranscript": "aws_transcribe::end_of_transcript",

View File

@@ -51,6 +51,10 @@ const speechMapper = (cred) => {
obj.client_id = o.client_id;
obj.secret = o.secret;
}
else if ('deepgram' === obj.vendor) {
const o = JSON.parse(decrypt(credential));
obj.api_key = o.api_key;
}
} catch (err) {
console.log(err);
}
@@ -74,6 +78,7 @@ module.exports = (logger, srf) => {
const haveMicrosoft = speech.find((s) => s.vendor === 'microsoft');
const haveWellsaid = speech.find((s) => s.vendor === 'wellsaid');
const haveNuance = speech.find((s) => s.vendor === 'nuance');
const haveDeepgram = speech.find((s) => s.vendor === 'deepgram');
if (!haveGoogle || !haveAws || !haveMicrosoft || !haveWellsaid || !haveNuance) {
const [r3] = await pp.query(sqlSpeechCredentialsForSP, account_sid);
if (r3.length) {
@@ -97,6 +102,10 @@ module.exports = (logger, srf) => {
const nuance = r3.find((s) => s.vendor === 'nuance');
if (nuance) speech.push(speechMapper(nuance));
}
if (!haveDeepgram) {
const deepgram = r3.find((s) => s.vendor === 'deepgram');
if (deepgram) speech.push(speechMapper(deepgram));
}
}
}

View File

@@ -3,52 +3,120 @@ const {
AzureTranscriptionEvents,
GoogleTranscriptionEvents,
AwsTranscriptionEvents,
NuanceTranscriptionEvents
NuanceTranscriptionEvents,
DeepgramTranscriptionEvents,
} = require('./constants');
const normalizeDeepgram = (evt, channel, language) => {
const copy = JSON.parse(JSON.stringify(evt));
const alternatives = (evt.channel?.alternatives || [])
.map((alt) => ({
confidence: alt.confidence,
transcript: alt.transcript,
}));
return {
language_code: language,
channel_tag: channel,
is_final: evt.is_final,
alternatives,
vendor: {
name: 'deepgram',
evt: copy
}
};
};
const normalizeGoogle = (evt, channel, language) => {
const copy = JSON.parse(JSON.stringify(evt));
return {
language_code: language,
channel_tag: channel,
is_final: evt.is_final,
alternatives: evt.alternatives,
vendor: {
name: 'google',
evt: copy
}
};
};
const normalizeNuance = (evt, channel, language) => {
const copy = JSON.parse(JSON.stringify(evt));
return {
language_code: language,
channel_tag: channel,
is_final: evt.is_final,
alternatives: evt.alternatives,
vendor: {
name: 'nuance',
evt: copy
}
};
};
const normalizeMicrosoft = (evt, channel, language) => {
const copy = JSON.parse(JSON.stringify(evt));
const nbest = evt.NBest;
const language_code = evt.PrimaryLanguage?.Language || language;
const alternatives = nbest ? nbest.map((n) => {
return {
confidence: n.Confidence,
transcript: n.Display
};
}) :
[
{
transcript: evt.DisplayText || evt.Text
}
];
return {
language_code,
channel_tag: channel,
is_final: evt.RecognitionStatus === 'Success',
alternatives,
vendor: {
name: 'microsoft',
evt: copy
}
};
};
const normalizeAws = (evt, channel, language) => {
const copy = JSON.parse(JSON.stringify(evt));
return {
language_code: language,
channel_tag: channel,
is_final: evt[0].is_final,
alternatives: evt[0].alternatives,
vendor: {
name: 'aws',
evt: copy
}
};
};
module.exports = (logger) => {
const normalizeTranscription = (evt, vendor, channel, language) => {
let newEvent = JSON.parse(JSON.stringify(evt));
/* add in channel_tag and provide the full vendor-specific event */
newEvent = {
...(vendor === 'aws' ? newEvent[0] : newEvent),
language_code: language,
channel_tag: channel
};
if ('aws' === vendor && Array.isArray(evt) && evt.length > 0) {
newEvent = {
...newEvent,
vendor: {event: evt, name: vendor}
};
logger.debug({ evt, vendor, channel, language }, 'normalizeTranscription');
switch (vendor) {
case 'deepgram':
return normalizeDeepgram(evt, channel, language);
case 'microsoft':
return normalizeMicrosoft(evt, channel, language);
case 'google':
return normalizeGoogle(evt, channel, language);
case 'aws':
return normalizeAws(evt, channel, language);
case 'nuance':
return normalizeNuance(evt, channel, language);
default:
logger.error(`Unknown vendor ${vendor}`);
return evt;
}
else if ('microsoft' === vendor) {
const nbest = evt.NBest;
const language_code = evt.PrimaryLanguage?.Language || language;
const alternatives = nbest ? nbest.map((n) => {
return {
confidence: n.Confidence,
transcript: n.Display
};
}) :
[
{
transcript: evt.DisplayText || evt.Text
}
];
newEvent = {
...newEvent,
is_final: evt.RecognitionStatus === 'Success',
channel,
language_code,
alternatives,
vendor: {event: evt, name: vendor}
};
}
return newEvent;
};
const setChannelVarsForStt = (task, sttCredentials, rOpts = {}) => {
@@ -201,6 +269,48 @@ module.exports = (logger) => {
{NUANCE_RESOURCES: JSON.stringify(nuanceOptions.resources)},
};
}
else if ('deepgram' === rOpts.vendor) {
const {deepgramOptions = {}} = rOpts;
opts = {
...opts,
...(sttCredentials.api_key) &&
{DEEPGRAM_API_KEY: sttCredentials.api_key},
...(deepgramOptions.tier) &&
{DEEPGRAM_SPEECH_TIER: deepgramOptions.tier},
...(deepgramOptions.model) &&
{DEEPGRAM_SPEECH_MODEL: deepgramOptions.model},
...(deepgramOptions.punctuate) &&
{DEEPGRAM_SPEECH_ENABLE_AUTOMATIC_PUNCTUATION: 1},
...(deepgramOptions.profanityFilter) &&
{DEEPGRAM_SPEECH_PROFANITY_FILTER: 1},
...(deepgramOptions.redact) &&
{DEEPGRAM_SPEECH_REDACT: 1},
...(deepgramOptions.diarize) &&
{DEEPGRAM_SPEECH_DIARIZE: 1},
...(deepgramOptions.diarizeVersion) &&
{DEEPGRAM_SPEECH_DIARIZE_VERSION: deepgramOptions.diarizeVersion},
...(deepgramOptions.ner) &&
{DEEPGRAM_SPEECH_NER: 1},
...(deepgramOptions.alternatives) &&
{DEEPGRAM_SPEECH_ALTERNATIVES: deepgramOptions.alternatives},
...(deepgramOptions.numerals) &&
{DEEPGRAM_SPEECH_NUMERALS: deepgramOptions.numerals},
...(deepgramOptions.search) &&
{DEEPGRAM_SPEECH_SEARCH: deepgramOptions.search.join(',')},
...(deepgramOptions.replace) &&
{DEEPGRAM_SPEECH_REPLACE: deepgramOptions.replace.join(',')},
...(rOpts.hints.length > 0 &&
{DEEPGRAM_SPEECH_KEYWORDS: rOpts.hints.join(',')}),
...(deepgramOptions.keywords) &&
{DEEPGRAM_SPEECH_KEYWORDS: deepgramOptions.keywords.join(',')},
...('endpointing' in deepgramOptions) &&
{DEEPGRAM_SPEECH_ENDPOINTING: deepgramOptions.endpointing},
...(deepgramOptions.vadTurnoff) &&
{DEEPGRAM_SPEECH_VAD_TURNOFF: deepgramOptions.vadTurnoff},
...(deepgramOptions.tag) &&
{DEEPGRAM_SPEECH_VAD_TURNOFF: deepgramOptions.tag}
};
}
logger.debug({opts}, 'recognizer channel vars');
return opts;
};
@@ -223,6 +333,11 @@ module.exports = (logger) => {
ep.removeCustomEventListener(NuanceTranscriptionEvents.Error);
ep.removeCustomEventListener(NuanceTranscriptionEvents.VadDetected);
ep.removeCustomEventListener(DeepgramTranscriptionEvents.Transcription);
ep.removeCustomEventListener(DeepgramTranscriptionEvents.Connect);
ep.removeCustomEventListener(DeepgramTranscriptionEvents.ConnectFailure);
};
return {
normalizeTranscription,