mirror of
https://github.com/jambonz/jambonz-feature-server.git
synced 2025-12-20 08:40:38 +00:00
Feature/deepgram stt (#190)
* initial changes to support deepgram stt * fixes for normalizing vendor-specific transcriptions * update to latest drachtio-fsmrf with support for deepgram stt * deepgram parsing error * hints support for deepgram * handling deepgram errors * ignore late arriving transcripts for deepgram * handling of empty transcripts * transcribe changes * allow deepgram stt credentials to be provided at run time * bind channel in transcription handler * fixes for transcribe when handling empty transcripts * more empty transcript fixes * update tests to latest modules * add test cases for deepgram speech recognition
This commit is contained in:
@@ -3,52 +3,120 @@ const {
|
||||
AzureTranscriptionEvents,
|
||||
GoogleTranscriptionEvents,
|
||||
AwsTranscriptionEvents,
|
||||
NuanceTranscriptionEvents
|
||||
NuanceTranscriptionEvents,
|
||||
DeepgramTranscriptionEvents,
|
||||
} = require('./constants');
|
||||
|
||||
/**
 * Convert a Deepgram transcription event into the vendor-neutral shape.
 *
 * Only `confidence` and `transcript` are kept for each alternative; the
 * complete vendor payload stays available as a deep copy under `vendor.evt`.
 *
 * @param {object} evt - raw Deepgram event; alternatives are read from
 *   `evt.channel.alternatives`
 * @param {*} channel - value reported back as `channel_tag`
 * @param {string} language - value reported back as `language_code`
 * @returns {object} normalized transcription
 */
const normalizeDeepgram = (evt, channel, language) => {
  /* deep copy so that consumers of vendor.evt never share state with the caller's event */
  const copy = JSON.parse(JSON.stringify(evt));

  /* an event without channel data yields an empty list rather than undefined */
  const alternatives = (evt.channel?.alternatives || [])
    .map((alt) => ({
      confidence: alt.confidence,
      transcript: alt.transcript,
    }));

  return {
    language_code: language,
    channel_tag: channel,
    is_final: evt.is_final,
    alternatives,
    vendor: {
      name: 'deepgram',
      evt: copy
    }
  };
};
|
||||
|
||||
/**
 * Convert a Google transcription event into the vendor-neutral shape.
 *
 * @param {object} evt - raw Google event carrying `is_final` and `alternatives`
 * @param {*} channel - value reported back as `channel_tag`
 * @param {string} language - value reported back as `language_code`
 * @returns {object} normalized transcription; `alternatives` is always an array
 */
const normalizeGoogle = (evt, channel, language) => {
  /* deep copy so that consumers of vendor.evt never share state with the caller's event */
  const copy = JSON.parse(JSON.stringify(evt));

  return {
    language_code: language,
    channel_tag: channel,
    is_final: evt.is_final,
    /* default to an empty list so callers can index alternatives safely */
    alternatives: evt.alternatives || [],
    vendor: {
      name: 'google',
      evt: copy
    }
  };
};
|
||||
|
||||
/**
 * Convert a Nuance transcription event into the vendor-neutral shape.
 *
 * @param {object} evt - raw Nuance event carrying `is_final` and `alternatives`
 * @param {*} channel - value reported back as `channel_tag`
 * @param {string} language - value reported back as `language_code`
 * @returns {object} normalized transcription; `alternatives` is always an array
 */
const normalizeNuance = (evt, channel, language) => {
  /* deep copy so that consumers of vendor.evt never share state with the caller's event */
  const copy = JSON.parse(JSON.stringify(evt));

  return {
    language_code: language,
    channel_tag: channel,
    is_final: evt.is_final,
    /* default to an empty list so callers can index alternatives safely */
    alternatives: evt.alternatives || [],
    vendor: {
      name: 'nuance',
      evt: copy
    }
  };
};
|
||||
|
||||
/**
 * Convert a Microsoft transcription event into the vendor-neutral shape.
 *
 * When the event has an `NBest` list, each entry becomes an alternative
 * (`Confidence` -> `confidence`, `Display` -> `transcript`). Otherwise a single
 * alternative is built from `DisplayText`, falling back to `Text`.
 *
 * @param {object} evt - raw Microsoft event
 * @param {*} channel - value reported back as `channel_tag`
 * @param {string} language - used as `language_code` unless the event itself
 *   reports one in `PrimaryLanguage.Language`
 * @returns {object} normalized transcription
 */
const normalizeMicrosoft = (evt, channel, language) => {
  /* deep copy so that consumers of vendor.evt never share state with the caller's event */
  const copy = JSON.parse(JSON.stringify(evt));
  const nbest = evt.NBest;

  /* a language reported by the event takes precedence over the requested one */
  const language_code = evt.PrimaryLanguage?.Language || language;

  const alternatives = nbest ?
    nbest.map((n) => ({
      confidence: n.Confidence,
      transcript: n.Display
    })) :
    [
      {
        transcript: evt.DisplayText || evt.Text
      }
    ];

  return {
    language_code,
    channel_tag: channel,
    /* only an event whose RecognitionStatus is 'Success' is treated as final */
    is_final: evt.RecognitionStatus === 'Success',
    alternatives,
    vendor: {
      name: 'microsoft',
      evt: copy
    }
  };
};
|
||||
|
||||
/**
 * Convert an AWS transcription event into the vendor-neutral shape.
 *
 * The event is an array of results; `is_final` and `alternatives` are taken
 * from its first element. An empty (or non-array) event no longer throws: it
 * produces an `undefined` `is_final` and an empty `alternatives` list.
 *
 * @param {Array<object>} evt - raw AWS event
 * @param {*} channel - value reported back as `channel_tag`
 * @param {string} language - value reported back as `language_code`
 * @returns {object} normalized transcription; `alternatives` is always an array
 */
const normalizeAws = (evt, channel, language) => {
  /* deep copy so that consumers of vendor.evt never share state with the caller's event */
  const copy = JSON.parse(JSON.stringify(evt));

  /* guard against empty transcripts: evt[0] may not exist */
  const first = Array.isArray(evt) && evt.length > 0 ? evt[0] : undefined;

  return {
    language_code: language,
    channel_tag: channel,
    is_final: first?.is_final,
    alternatives: first?.alternatives || [],
    vendor: {
      name: 'aws',
      evt: copy
    }
  };
};
|
||||
|
||||
|
||||
module.exports = (logger) => {
|
||||
const normalizeTranscription = (evt, vendor, channel, language) => {
|
||||
let newEvent = JSON.parse(JSON.stringify(evt));
|
||||
|
||||
/* add in channel_tag and provide the full vendor-specific event */
|
||||
newEvent = {
|
||||
...(vendor === 'aws' ? newEvent[0] : newEvent),
|
||||
language_code: language,
|
||||
channel_tag: channel
|
||||
};
|
||||
|
||||
|
||||
if ('aws' === vendor && Array.isArray(evt) && evt.length > 0) {
|
||||
newEvent = {
|
||||
...newEvent,
|
||||
vendor: {event: evt, name: vendor}
|
||||
};
|
||||
logger.debug({ evt, vendor, channel, language }, 'normalizeTranscription');
|
||||
switch (vendor) {
|
||||
case 'deepgram':
|
||||
return normalizeDeepgram(evt, channel, language);
|
||||
case 'microsoft':
|
||||
return normalizeMicrosoft(evt, channel, language);
|
||||
case 'google':
|
||||
return normalizeGoogle(evt, channel, language);
|
||||
case 'aws':
|
||||
return normalizeAws(evt, channel, language);
|
||||
case 'nuance':
|
||||
return normalizeNuance(evt, channel, language);
|
||||
default:
|
||||
logger.error(`Unknown vendor ${vendor}`);
|
||||
return evt;
|
||||
}
|
||||
else if ('microsoft' === vendor) {
|
||||
const nbest = evt.NBest;
|
||||
const language_code = evt.PrimaryLanguage?.Language || language;
|
||||
const alternatives = nbest ? nbest.map((n) => {
|
||||
return {
|
||||
confidence: n.Confidence,
|
||||
transcript: n.Display
|
||||
};
|
||||
}) :
|
||||
[
|
||||
{
|
||||
transcript: evt.DisplayText || evt.Text
|
||||
}
|
||||
];
|
||||
|
||||
newEvent = {
|
||||
...newEvent,
|
||||
is_final: evt.RecognitionStatus === 'Success',
|
||||
channel,
|
||||
language_code,
|
||||
alternatives,
|
||||
vendor: {event: evt, name: vendor}
|
||||
};
|
||||
}
|
||||
return newEvent;
|
||||
};
|
||||
|
||||
const setChannelVarsForStt = (task, sttCredentials, rOpts = {}) => {
|
||||
@@ -201,6 +269,48 @@ module.exports = (logger) => {
|
||||
{NUANCE_RESOURCES: JSON.stringify(nuanceOptions.resources)},
|
||||
};
|
||||
}
|
||||
else if ('deepgram' === rOpts.vendor) {
|
||||
const {deepgramOptions = {}} = rOpts;
|
||||
opts = {
|
||||
...opts,
|
||||
...(sttCredentials.api_key) &&
|
||||
{DEEPGRAM_API_KEY: sttCredentials.api_key},
|
||||
...(deepgramOptions.tier) &&
|
||||
{DEEPGRAM_SPEECH_TIER: deepgramOptions.tier},
|
||||
...(deepgramOptions.model) &&
|
||||
{DEEPGRAM_SPEECH_MODEL: deepgramOptions.model},
|
||||
...(deepgramOptions.punctuate) &&
|
||||
{DEEPGRAM_SPEECH_ENABLE_AUTOMATIC_PUNCTUATION: 1},
|
||||
...(deepgramOptions.profanityFilter) &&
|
||||
{DEEPGRAM_SPEECH_PROFANITY_FILTER: 1},
|
||||
...(deepgramOptions.redact) &&
|
||||
{DEEPGRAM_SPEECH_REDACT: 1},
|
||||
...(deepgramOptions.diarize) &&
|
||||
{DEEPGRAM_SPEECH_DIARIZE: 1},
|
||||
...(deepgramOptions.diarizeVersion) &&
|
||||
{DEEPGRAM_SPEECH_DIARIZE_VERSION: deepgramOptions.diarizeVersion},
|
||||
...(deepgramOptions.ner) &&
|
||||
{DEEPGRAM_SPEECH_NER: 1},
|
||||
...(deepgramOptions.alternatives) &&
|
||||
{DEEPGRAM_SPEECH_ALTERNATIVES: deepgramOptions.alternatives},
|
||||
...(deepgramOptions.numerals) &&
|
||||
{DEEPGRAM_SPEECH_NUMERALS: deepgramOptions.numerals},
|
||||
...(deepgramOptions.search) &&
|
||||
{DEEPGRAM_SPEECH_SEARCH: deepgramOptions.search.join(',')},
|
||||
...(deepgramOptions.replace) &&
|
||||
{DEEPGRAM_SPEECH_REPLACE: deepgramOptions.replace.join(',')},
|
||||
...(rOpts.hints.length > 0 &&
|
||||
{DEEPGRAM_SPEECH_KEYWORDS: rOpts.hints.join(',')}),
|
||||
...(deepgramOptions.keywords) &&
|
||||
{DEEPGRAM_SPEECH_KEYWORDS: deepgramOptions.keywords.join(',')},
|
||||
...('endpointing' in deepgramOptions) &&
|
||||
{DEEPGRAM_SPEECH_ENDPOINTING: deepgramOptions.endpointing},
|
||||
...(deepgramOptions.vadTurnoff) &&
|
||||
{DEEPGRAM_SPEECH_VAD_TURNOFF: deepgramOptions.vadTurnoff},
|
||||
...(deepgramOptions.tag) &&
|
||||
{DEEPGRAM_SPEECH_VAD_TURNOFF: deepgramOptions.tag}
|
||||
};
|
||||
}
|
||||
logger.debug({opts}, 'recognizer channel vars');
|
||||
return opts;
|
||||
};
|
||||
@@ -223,6 +333,11 @@ module.exports = (logger) => {
|
||||
ep.removeCustomEventListener(NuanceTranscriptionEvents.Error);
|
||||
ep.removeCustomEventListener(NuanceTranscriptionEvents.VadDetected);
|
||||
|
||||
ep.removeCustomEventListener(DeepgramTranscriptionEvents.Transcription);
|
||||
ep.removeCustomEventListener(DeepgramTranscriptionEvents.Connect);
|
||||
ep.removeCustomEventListener(DeepgramTranscriptionEvents.ConnectFailure);
|
||||
|
||||
|
||||
};
|
||||
return {
|
||||
normalizeTranscription,
|
||||
|
||||
Reference in New Issue
Block a user