deepgram: rework continuous asr, and resolve on speech_final not is_f… (#501)

* deepgram: rework continuous asr, and resolve on speech_final not is_final (wip)

* wip

* deepgram: empty final transcript should trigger resolve with speech if we have buffered transcripts

* wip

* fixes for deepgram compiling multiple transcripts

* test deepgram utteranceEndMs

* more handling of utteranceEndMs

* wip

* better handling of digit strings collected over multiple deepgram responses

* wip

* add support for deepgramOptions.shortUtterance which triggers off of is_final instead of speech_final

* apply deepgram fixes to transcribe

* cleanup continuous asr

* more continuous asr fixes for deepgram

* update to verb-specifications for handling SttTask properties

* set log level for tests back to error
This commit is contained in:
Dave Horton
2023-10-30 13:57:25 -04:00
committed by GitHub
parent 67f8f7181a
commit f43a5c1491
4 changed files with 179 additions and 60 deletions

View File

@@ -52,6 +52,7 @@ const stickyVars = {
'DEEPGRAM_SPEECH_SEARCH',
'DEEPGRAM_SPEECH_REPLACE',
'DEEPGRAM_SPEECH_ENDPOINTING',
'DEEPGRAM_SPEECH_UTTERANCE_END_MS',
'DEEPGRAM_SPEECH_VAD_TURNOFF',
'DEEPGRAM_SPEECH_TAG'
],
@@ -106,6 +107,53 @@ const stickyVars = {
]
};
/**
 * Merge several buffered Deepgram transcript events into a single final
 * transcript event.
 *
 * Fragments are joined with a single space, except that a fragment starting
 * with a digit is appended directly to text ending in a digit, so a number
 * spoken across several responses ("1 2 3," then "4 5 6.") comes out as one
 * digit string ("123456").
 *
 * @param {Array<object>} bufferedTranscripts - normalized transcript events,
 *   each expected to carry `alternatives[0].transcript` and
 *   `alternatives[0].confidence`
 * @param {*} channel - value copied to `channel_tag` on the result
 * @param {*} language - value copied to `language_code` on the result
 * @returns {object} the sole buffered event, unchanged, when exactly one was
 *   buffered; otherwise a new event with the merged transcript, the mean
 *   confidence, and the original events under `vendor.evt`
 */
const consolidateTranscripts = (bufferedTranscripts, channel, language) => {
  // A single buffered event needs no merging; hand it back untouched.
  if (bufferedTranscripts.length === 1) return bufferedTranscripts[0];

  let confidenceSum = 0;
  let confidenceCount = 0;
  let transcript = '';

  for (const evt of bufferedTranscripts) {
    // The first alternative may be missing when the vendor returned none.
    const alt = evt.alternatives?.[0];

    if (Number.isFinite(alt?.confidence)) {
      confidenceSum += alt.confidence;
      confidenceCount += 1;
    }

    let piece = alt?.transcript ?? '';

    // A fragment made only of digits, spaces and one optional trailing comma
    // or period is reduced to its bare digits.
    if (/^[\d\s]+[,.]?$/.test(piece)) {
      piece = piece.replace(/\s/g, '').replace(/[,.]$/, '');
    }

    // Skip empty fragments so they do not leave doubled spaces in the result.
    if (piece === '') continue;

    // Digit directly followed by digit: concatenate without a separator.
    const continuesNumber = /\d$/.test(transcript) && /^\d/.test(piece);
    transcript += continuesNumber ? piece : ` ${piece}`;
  }

  return {
    language_code: language,
    channel_tag: channel,
    is_final: true,
    alternatives: [{
      transcript: transcript.trim(),
      // Guard the average so an empty buffer yields 0 rather than NaN.
      confidence: confidenceCount > 0 ? confidenceSum / confidenceCount : 0
    }],
    vendor: {
      name: 'deepgram',
      evt: bufferedTranscripts
    }
  };
};
const compileSonioxTranscripts = (finalWordChunks, channel, language) => {
const words = finalWordChunks.flat();
const transcript = words.reduce((acc, word) => {
@@ -163,7 +211,7 @@ const normalizeSoniox = (evt, channel, language) => {
};
};
const normalizeDeepgram = (evt, channel, language) => {
const normalizeDeepgram = (evt, channel, language, shortUtterance) => {
const copy = JSON.parse(JSON.stringify(evt));
const alternatives = (evt.channel?.alternatives || [])
.map((alt) => ({
@@ -171,10 +219,14 @@ const normalizeDeepgram = (evt, channel, language) => {
transcript: alt.transcript,
}));
/**
* note difference between is_final and speech_final in Deepgram:
* https://developers.deepgram.com/docs/understand-endpointing-interim-results
*/
return {
language_code: language,
channel_tag: channel,
is_final: evt.is_final,
is_final: shortUtterance ? evt.is_final : evt.speech_final,
alternatives: [alternatives[0]],
vendor: {
name: 'deepgram',
@@ -325,12 +377,12 @@ const normalizeAws = (evt, channel, language) => {
module.exports = (logger) => {
const normalizeTranscription = (evt, vendor, channel, language) => {
const normalizeTranscription = (evt, vendor, channel, language, shortUtterance) => {
//logger.debug({ evt, vendor, channel, language }, 'normalizeTranscription');
switch (vendor) {
case 'deepgram':
return normalizeDeepgram(evt, channel, language);
return normalizeDeepgram(evt, channel, language, shortUtterance);
case 'microsoft':
return normalizeMicrosoft(evt, channel, language);
case 'google':
@@ -536,6 +588,8 @@ module.exports = (logger) => {
{DEEPGRAM_SPEECH_KEYWORDS: deepgramOptions.keywords.join(',')},
...('endpointing' in deepgramOptions) &&
{DEEPGRAM_SPEECH_ENDPOINTING: deepgramOptions.endpointing},
...(deepgramOptions.utteranceEndMs) &&
{DEEPGRAM_SPEECH_UTTERANCE_END_MS: deepgramOptions.utteranceEndMs},
...(deepgramOptions.vadTurnoff) &&
{DEEPGRAM_SPEECH_VAD_TURNOFF: deepgramOptions.vadTurnoff},
...(deepgramOptions.tag) &&
@@ -743,6 +797,7 @@ module.exports = (logger) => {
setChannelVarsForStt,
removeSpeechListeners,
setSpeechCredentialsAtRuntime,
compileSonioxTranscripts
compileSonioxTranscripts,
consolidateTranscripts
};
};